From 426d31071ac476ea62c62656b242930c17b58c00 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Sat, 7 Aug 2010 12:30:03 +0200
Subject: fix printk typo 'faild'

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/exofs/inode.c   | 14 +++++++-------
 fs/exofs/ios.c     | 10 +++++-----
 fs/jfs/jfs_mount.c |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 4bb6ef822e46..5862ae87ed29 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -508,7 +508,7 @@ static int write_exec(struct page_collect *pcol)
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
 	if (!pcol_copy) {
-		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
 		ret = -ENOMEM;
 		goto err;
 	}
@@ -524,7 +524,7 @@ static int write_exec(struct page_collect *pcol)
 
 	ret = exofs_oi_write(oi, ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
+		EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
 		goto err;
 	}
 
@@ -625,7 +625,7 @@ try_again:
 		/* split the request, next loop will start again */
 		ret = write_exec(pcol);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("write_exec faild => %d", ret);
+			EXOFS_DBGMSG("write_exec failed => %d", ret);
 			goto fail;
 		}
 
@@ -709,7 +709,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
 					 fsdata);
 		if (ret) {
-			EXOFS_DBGMSG("simple_write_begin faild\n");
+			EXOFS_DBGMSG("simple_write_begin failed\n");
 			return ret;
 		}
 
@@ -722,7 +722,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		if (ret) {
 			/*SetPageError was done by _readpage. Is it ok?*/
 			unlock_page(page);
-			EXOFS_DBGMSG("__readpage_filler faild\n");
+			EXOFS_DBGMSG("__readpage_filler failed\n");
 		}
 	}
 
@@ -1112,7 +1112,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
-		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
 			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
@@ -1232,7 +1232,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
 	if (!args) {
-		EXOFS_DBGMSG("Faild kzalloc of args\n");
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
 		return -ENOMEM;
 	}
 
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777b..95921f501f2f 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 
 	ret = osd_finalize_request(or, 0, cred, NULL);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
 		goto out;
 	}
 
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
 	 */
 	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
-		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+		EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
 			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 
 		ret = osd_finalize_request(or, 0, ios->cred, NULL);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+			EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
 				     ret);
 			return ret;
 		}
@@ -365,7 +365,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
-			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+			EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
 				     bio_size);
 			return -ENOMEM;
 		}
@@ -584,7 +584,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
 					EXOFS_DBGMSG(
-					      "Faild to allocate BIO size=%u\n",
+					      "Failed to allocate BIO size=%u\n",
 					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
 
 	ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
 	if (ipaimap == NULL) {
-		jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+		jfs_err("jfs_mount: Failed to read AGGREGATE_I");
 		rc = -EIO;
 		goto errout20;
 	}
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
 	if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
 		ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
 		if (!ipaimap2) {
-			jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+			jfs_err("jfs_mount: Failed to read AGGREGATE_I");
 			rc = -EIO;
 			goto errout35;
 		}
-- 
cgit v1.2.3


From 0378da0fda6edf5aaffda6f1248a78986bd955b5 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 12 Aug 2010 10:25:28 +0800
Subject: ocfs2: pass struct file* to ocfs2_write_begin_nolock.

struct file * has file_ra_state to store the readahead state
and data. So pass this to ocfs2_write_begin_nolock so that
it can be used in ocfs2_refcount_cow.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c | 5 +++--
 fs/ocfs2/aops.h | 3 ++-
 fs/ocfs2/mmap.c | 7 ++++---
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..e3efdd93773f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
 	return ret;
 }
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
 				       fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..7606f663da6d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f139..b04d6961c0d4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	return ret;
 }
 
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 				struct page *page)
 {
 	int ret;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
 	loff_t pos = page_offset(page);
 	unsigned int len = PAGE_CACHE_SIZE;
@@ -109,7 +110,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	if (page->index == last_index)
 		len = size & ~PAGE_CACHE_MASK;
 
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
 				       &fsdata, di_bh, page);
 	if (ret) {
 		if (ret != -ENOSPC)
@@ -157,7 +158,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+	ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
 
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-- 
cgit v1.2.3


From b890823635e022467b924e3d9da8c5166a4e349c Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 12 Aug 2010 10:27:14 +0800
Subject: ocfs2: pass struct file* to ocfs2_prepare_inode_for_write.

struct file * has file_ra_state to store the readahead state
and data. So pass this to ocfs2_prepare_inode_for_write. so
that it can be used in ocfs2_refcount_cow.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/file.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 81296b4e3646..5920159a421f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2053,6 +2053,7 @@ out:
 }
 
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    struct file *file,
 					    loff_t pos, size_t count,
 					    int *meta_level)
 {
@@ -2078,7 +2079,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
 					 loff_t *ppos,
 					 size_t count,
 					 int appending,
@@ -2086,6 +2087,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 int *has_refcount)
 {
 	int ret = 0, meta_level = 0;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	loff_t saved_pos, end;
 
@@ -2141,6 +2143,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			meta_level = -1;
 
 			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       file,
 							       saved_pos,
 							       count,
 							       &meta_level);
@@ -2255,7 +2258,7 @@ relock:
 	}
 
 	can_do_direct = direct_io;
-	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+	ret = ocfs2_prepare_inode_for_write(file, ppos,
 					    iocb->ki_left, appending,
 					    &can_do_direct, &has_refcount);
 	if (ret < 0) {
@@ -2385,7 +2388,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
 {
 	int ret;
 
-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry,	&sd->pos,
+	ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
 					    sd->total_len, 0, NULL, NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
-- 
cgit v1.2.3


From 155027121fe52f9b4f25e9d156c22f2f5012e5fe Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 12 Aug 2010 10:36:38 +0800
Subject: ocfs2: Add struct file to ocfs2_refcount_cow.

Add a new parameter 'struct file *' to ocfs2_refcount_cow
so that we can add readahead support later.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c         | 2 +-
 fs/ocfs2/file.c         | 8 ++++----
 fs/ocfs2/refcounttree.c | 4 +++-
 fs/ocfs2/refcounttree.h | 3 ++-
 4 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e3efdd93773f..7155c5a919d7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1693,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
 		mlog_errno(ret);
 		goto out;
 	} else if (ret == 1) {
-		ret = ocfs2_refcount_cow(inode, di_bh,
+		ret = ocfs2_refcount_cow(inode, filp, di_bh,
 					 wc->w_cpos, wc->w_clen, UINT_MAX);
 		if (ret) {
 			mlog_errno(ret);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5920159a421f..4331f57e9fde 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -361,7 +361,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
 	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 		goto out;
 
-	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+	return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
 
 out:
 	return status;
@@ -904,8 +904,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
 		zero_clusters = last_cpos - zero_cpos;
 
 	if (needs_cow) {
-		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-					UINT_MAX);
+		rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+					zero_clusters, UINT_MAX);
 		if (rc) {
 			mlog_errno(rc);
 			goto out;
@@ -2071,7 +2071,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 
 	*meta_level = 1;
 
-	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+	ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
 	if (ret)
 		mlog_errno(ret);
 out:
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9c..66dab3c0d495 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3404,6 +3404,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
  * unrefcounted extent.
  */
 static int ocfs2_refcount_cow_hunk(struct inode *inode,
+				   struct file *file,
 				   struct buffer_head *di_bh,
 				   u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3481,6 +3482,7 @@ out:
  * clusters between cpos and cpos+write_len are safe to modify.
  */
 int ocfs2_refcount_cow(struct inode *inode,
+		       struct file *file,
 		       struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3500,7 +3502,7 @@ int ocfs2_refcount_cow(struct inode *inode,
 			num_clusters = write_len;
 
 		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
-			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+			ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
 						      num_clusters, max_cpos);
 			if (ret) {
 				mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..29cba0eaa927 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct file *filep, struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
 
 typedef int (ocfs2_post_refcount_func)(struct inode *inode,
-- 
cgit v1.2.3


From 7b61cf54a2445ad21df9dd44f0c8bb90154ddea8 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 17 Jun 2010 14:14:36 +0800
Subject: ocfs2: Add readahead support for CoW.

Add a new function ocfs2_readahead_for_cow so that
we start readahead before we start our CoW.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 66dab3c0d495..ad08c1134baa 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3398,6 +3398,28 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
 	return ret;
 }
 
+static void ocfs2_readahead_for_cow(struct inode *inode,
+				    struct file *file,
+				    u32 start, u32 len)
+{
+	struct address_space *mapping;
+	pgoff_t index;
+	unsigned long num_pages;
+	int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+	if (!file)
+		return;
+
+	mapping = file->f_mapping;
+	num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+	if (!num_pages)
+		num_pages = 1;
+
+	index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+	page_cache_sync_readahead(mapping, &file->f_ra, file,
+				  index, num_pages);
+}
+
 /*
  * Starting at cpos, try to CoW write_len clusters.  Don't CoW
  * past max_cpos.  This will stop when it runs into a hole or an
@@ -3433,6 +3455,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 
 	BUG_ON(cow_len == 0);
 
+	ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
 	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
 	if (!context) {
 		ret = -ENOMEM;
-- 
cgit v1.2.3


From 6ea4843f53282465f2bdbe5eedde7d8c3081dfdf Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 12 Aug 2010 10:31:34 +0800
Subject: ocfs2: Add readhead during CoW.

In CoW, when we meet with a readahead page, we know
it is time to move the readahead window. So carry
out a new readahead.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ad08c1134baa..47549f64224c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
 
 struct ocfs2_cow_context {
 	struct inode *inode;
+	struct file *file;
 	u32 cow_start;
 	u32 cow_len;
 	struct ocfs2_extent_tree data_et;
@@ -2922,13 +2923,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
 	struct page *page;
 	pgoff_t page_index;
-	unsigned int from, to;
+	unsigned int from, to, readahead_pages;
 	loff_t offset, end, map_end;
 	struct address_space *mapping = context->inode->i_mapping;
 
 	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
 	     new_cluster, new_len, cpos);
 
+	readahead_pages =
+		(ocfs2_cow_contig_clusters(sb) <<
+		 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
 	/*
@@ -2959,6 +2963,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
 			BUG_ON(PageDirty(page));
 
+		if (PageReadahead(page) && context->file) {
+			page_cache_async_readahead(mapping,
+						   &context->file->f_ra,
+						   context->file,
+						   page, page_index,
+						   readahead_pages);
+		}
+
 		if (!PageUptodate(page)) {
 			ret = block_read_full_page(page, ocfs2_get_block);
 			if (ret) {
@@ -3478,6 +3490,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 	context->ref_root_bh = ref_root_bh;
 	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
 	context->get_clusters = ocfs2_di_get_clusters;
+	context->file = file;
 
 	ocfs2_init_dinode_extent_tree(&context->data_et,
 				      INODE_CACHE(inode), di_bh);
-- 
cgit v1.2.3


From 5ffef88ffeb730e1bf2da56a39a55e03d57a66c9 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 4 Aug 2010 10:14:47 +0300
Subject: UBIFS: switch to RO mode after synchronizing

In 'ubifs_garbage_collect()' on error path, we first switch to R/O mode, and
then synchronize write-buffers (to make sure no data are lost). But the GC
write-buffer synchronization will fail, because we are already in R/O mode.
This patch re-orders this and makes sure we first synchronize the write-buffer,
and then switch to R/O mode.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..f89a422ca395 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -774,8 +774,8 @@ out_unlock:
 out:
 	ubifs_assert(ret < 0);
 	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
-	ubifs_ro_mode(c, ret);
 	ubifs_wbuf_sync_nolock(wbuf);
+	ubifs_ro_mode(c, ret);
 	mutex_unlock(&wbuf->io_mutex);
 	ubifs_return_leb(c, lp.lnum);
 	return ret;
-- 
cgit v1.2.3


From efe1881f5482f94f5e5e6cb74bf3ea72f2b5b9ce Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 4 Aug 2010 14:06:06 +0300
Subject: UBIFS: do not treat ENOSPC specially

'ubifs_garbage_collect_leb()' should never return '-ENOSPC', and if it
does, this is an error. Thus, do not treat this error code specially.
'-EAGAIN' is a special error code, but not '-ENOSPC'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f89a422ca395..ee4b05de4d48 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -677,14 +677,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 
 		ret = ubifs_garbage_collect_leb(c, &lp);
 		if (ret < 0) {
-			if (ret == -EAGAIN || ret == -ENOSPC) {
+			if (ret == -EAGAIN) {
 				/*
-				 * These codes are not errors, so we have to
-				 * return the LEB to lprops. But if the
-				 * 'ubifs_return_leb()' function fails, its
-				 * failure code is propagated to the caller
-				 * instead of the original '-EAGAIN' or
-				 * '-ENOSPC'.
+				 * This is not error, so we have to return the
+				 * LEB to lprops. But if 'ubifs_return_leb()'
+				 * fails, its failure code is propagated to the
+				 * caller instead of the original '-EAGAIN'.
 				 */
 				err = ubifs_return_leb(c, lp.lnum);
 				if (err)
-- 
cgit v1.2.3


From e3408ad4cbed6ec6990efad4c2ef0856bcd3c712 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 4 Aug 2010 13:50:51 +0300
Subject: UBIFS: fix assertion warning

This patch fixes the following false assertion warning:

UBIFS assert failed in data_nodes_cmp at 130 (pid 15107)

The assertion was wrong because it did not take into account that the
node can be an xentry.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index ee4b05de4d48..98b2c3c8725a 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -178,7 +178,8 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	if (typeb == UBIFS_INO_KEY)
 		return 1;
 
-	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	ubifs_assert(typea == UBIFS_DENT_KEY || typea == UBIFS_XENT_KEY);
+	ubifs_assert(typeb == UBIFS_DENT_KEY || typeb == UBIFS_XENT_KEY);
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
 
-- 
cgit v1.2.3


From 44ec83b8bd05d323998031f141c310127721acae Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 7 Aug 2010 08:44:13 +0300
Subject: UBIFS: do not look up truncation nodes

When moving nodes in GC, do not try to look up truncation nodes in TNC,
because they do not exist there. This would be harmless, because the TNC
look-up would fail, if we did not have bug 'ubifs_add_snod()' which reads
garbage into 'snod->key'. But in any case, it is less error prone to
explicitly ignore everything but inode, data, dentry and xentry nodes.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 98b2c3c8725a..9dbbc5c88940 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -233,9 +233,26 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
 		int err;
 
-		ubifs_assert(snod->type != UBIFS_IDX_NODE);
-		ubifs_assert(snod->type != UBIFS_REF_NODE);
-		ubifs_assert(snod->type != UBIFS_CS_NODE);
+		ubifs_assert(snod->type == UBIFS_INO_NODE  ||
+			     snod->type == UBIFS_DATA_NODE ||
+			     snod->type == UBIFS_DENT_NODE ||
+			     snod->type == UBIFS_XENT_NODE ||
+			     snod->type == UBIFS_TRUN_NODE);
+
+		if (snod->type != UBIFS_INO_NODE  &&
+		    snod->type != UBIFS_DATA_NODE &&
+		    snod->type != UBIFS_DENT_NODE &&
+		    snod->type != UBIFS_XENT_NODE) {
+			/* Probably truncation node, zap it */
+			list_del(&snod->list);
+			kfree(snod);
+			continue;
+		}
+
+		ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
+			     key_type(c, &snod->key) == UBIFS_INO_KEY  ||
+			     key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+			     key_type(c, &snod->key) == UBIFS_XENT_KEY);
 
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
-- 
cgit v1.2.3


From ab87118d717467cbcd9648692c2a9708d55193bc Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 8 Aug 2010 12:25:33 +0300
Subject: UBIFS: do not use key type in list_sort

In comparison function for 'list_sort()' we use key type to distinguish between
node types. However, we have a bit simper way to detect node type -
'snod->type'. This more logical to use, comparing to decoding key types. Also
allows to get rid of 2 local variables.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9dbbc5c88940..27815bb91125 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -157,7 +157,6 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
  */
 int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
-	int typea, typeb;
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
 	struct ubifs_scan_node *sa, *sb;
@@ -165,21 +164,22 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	cond_resched();
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
-	typea = key_type(c, &sa->key);
-	typeb = key_type(c, &sb->key);
-	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+	ubifs_assert(sa->type != UBIFS_DATA_NODE &&
+		     sb->type != UBIFS_DATA_NODE);
 
 	/* Inodes go before directory entries */
-	if (typea == UBIFS_INO_KEY) {
-		if (typeb == UBIFS_INO_KEY)
+	if (sa->type == UBIFS_INO_NODE) {
+		if (sb->type == UBIFS_INO_NODE)
 			return sb->len - sa->len;
 		return -1;
 	}
-	if (typeb == UBIFS_INO_KEY)
+	if (sb->type == UBIFS_INO_NODE)
 		return 1;
 
-	ubifs_assert(typea == UBIFS_DENT_KEY || typea == UBIFS_XENT_KEY);
-	ubifs_assert(typeb == UBIFS_DENT_KEY || typeb == UBIFS_XENT_KEY);
+	ubifs_assert(sa->type == UBIFS_DENT_NODE ||
+		     sa->type == UBIFS_XENT_NODE);
+	ubifs_assert(sb->type == UBIFS_DENT_NODE ||
+		     sb->type == UBIFS_XENT_NODE);
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
 
-- 
cgit v1.2.3


From 66576833f5396af34c52160b16d7b8573199282a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 8 Aug 2010 12:29:58 +0300
Subject: UBIFS: improve assertion in node comparison functions

Improve assertions in gc.c in the comparison functions for 'list_sort()': check
key types _and_ node types.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 27815bb91125..dafef3d195d3 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -127,8 +127,11 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	cond_resched();
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
+
 	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
 	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+	ubifs_assert(sa->type == UBIFS_DATA_NODE);
+	ubifs_assert(sb->type == UBIFS_DATA_NODE);
 
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
@@ -164,6 +167,9 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	cond_resched();
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
+
+	ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
+		     key_type(c, &sb->key) != UBIFS_DATA_KEY);
 	ubifs_assert(sa->type != UBIFS_DATA_NODE &&
 		     sb->type != UBIFS_DATA_NODE);
 
@@ -176,10 +182,15 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	if (sb->type == UBIFS_INO_NODE)
 		return 1;
 
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sa->key) == UBIFS_XENT_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sb->key) == UBIFS_XENT_KEY);
 	ubifs_assert(sa->type == UBIFS_DENT_NODE ||
 		     sa->type == UBIFS_XENT_NODE);
 	ubifs_assert(sb->type == UBIFS_DENT_NODE ||
 		     sb->type == UBIFS_XENT_NODE);
+
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
 
-- 
cgit v1.2.3


From 5b7a3a2e1b0cbc7d5410a8da60dac266a3e19268 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 8 Aug 2010 12:32:00 +0300
Subject: UBIFS: do not write rubbish into truncation scanning node

In the scanning code, in 'ubifs_add_snod()', we write rubbish into
'snod->key', because we assume that on-flash truncation nodes have a key, but
they do not. If the other parts of UBIFS then mistakenly try to look-up
the truncation node key (they should not do this, but may do because of a bug),
we can succeed and corrupt TNC. It looks like we did have such a situation in
'sort_nodes()' in gc.c.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/scan.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..a0a305ca61af 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -212,7 +212,6 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	case UBIFS_DENT_NODE:
 	case UBIFS_XENT_NODE:
 	case UBIFS_DATA_NODE:
-	case UBIFS_TRUN_NODE:
 		/*
 		 * The key is in the same place in all keyed
 		 * nodes.
-- 
cgit v1.2.3


From ba2f48f70efcf4d82deafb2be327ed64b1f043a5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 22 Aug 2010 07:10:12 +0300
Subject: UBIFS: mark unused key objects as invalid

When scanning the flash, UBIFS builds a list of flash nodes of type
'struct ubifs_scan_node'. Each scanned node has a 'snod->key' field. This field
is valid for most of the nodes, but invalid for some node type, e.g., truncation
nodes. It is safer to explicitly initialize such keys to something invalid,
rather than leaving them initialized to all zeros, which has key type of
UBIFS_INO_KEY.

This patch introduces new "fake" key type UBIFS_INVALID_KEY and initializes
unused 'snod->key' objects to this type. It also adds debugging assertions in
the TNC code to make sure no one ever tries to look these nodes up in the TNC.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/key.h   | 14 ++++++++++++++
 fs/ubifs/scan.c  |  5 ++++-
 fs/ubifs/tnc.c   |  5 ++++-
 fs/ubifs/ubifs.h |  6 +++++-
 4 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -305,6 +305,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
 	key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
 }
 
+/**
+ * invalid_key_init - initialize invalid node key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ *
+ * This is a helper function which marks a @key object as invalid.
+ */
+static inline void invalid_key_init(const struct ubifs_info *c,
+				    union ubifs_key *key)
+{
+	key->u32[0] = 0xDEADBEAF;
+	key->u32[1] = UBIFS_INVALID_KEY;
+}
+
 /**
  * key_type - get key type.
  * @c: UBIFS file-system description object
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index a0a305ca61af..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	struct ubifs_ino_node *ino = buf;
 	struct ubifs_scan_node *snod;
 
-	snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+	snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
 	if (!snod)
 		return -ENOMEM;
 
@@ -218,6 +218,9 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		 */
 		key_read(c, &ino->key, &snod->key);
 		break;
+	default:
+		invalid_key_init(c, &snod->key);
+		break;
 	}
 	list_add_tail(&snod->list, &sleb->nodes);
 	sleb->nodes_cnt += 1;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 	unsigned long time = get_seconds();
 
 	dbg_tnc("search key %s", DBGKEY(key));
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
 
 	znode = c->zroot.znode;
 	if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
  *
  * This function searches an indexing node by its first key @key and its
  * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
- * nodes it traverses to TNC. This function is called fro indexing nodes which
+ * nodes it traverses to TNC. This function is called for indexing nodes which
  * were found on the media by scanning, for example when garbage-collecting or
  * when doing in-the-gaps commit. This means that the indexing node which is
  * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
 	struct ubifs_znode *znode, *zn;
 	int n, nn;
 
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
+
 	/*
 	 * The arguments have probably been read off flash, so don't assume
 	 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..c4dc9b18f73e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
  * in TNC. However, when replaying, it is handy to introduce fake "truncation"
  * keys for truncation nodes because the code becomes simpler. So we define
  * %UBIFS_TRUN_KEY type.
+ *
+ * But otherwise, out of the journal reply scope, the truncation keys are
+ * invalid.
  */
-#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
+#define UBIFS_TRUN_KEY    UBIFS_KEY_TYPES_CNT
+#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
 
 /*
  * How much a directory entry/extended attribute entry adds to the parent/host
-- 
cgit v1.2.3


From 1a9476a77083354005750c9df45ba9d71ad12c8c Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 8 Aug 2010 12:45:23 +0300
Subject: UBIFS: fix assertion warnings in comparison function

When running the integrity test ('integck' from mtd-utils) on current
UBIFS on 2.6.35, I see that assertions in UBIFS 'list_sort()' comparison
functions trigger sometimes, e.g.:

UBIFS assert failed in data_nodes_cmp at 132 (pid 28311)

My investigation showed that this happens when 'list_sort()' calls the 'cmp()'
function with equivalent arguments. In this case, the 'struct list_head'
parameter, passed to 'cmp()' is bogus, and it does not belong to any element in
the original list.

And this issue seems to be introduced by commit:

commit 835cc0c8477fdbc59e0217891d6f11061b1ac4e2
Author: Don Mullis <don.mullis@gmail.com>
Date:   Fri Mar 5 13:43:15 2010 -0800

It is easy to work around the issue by doing:

if (a == b)
	return 0;

in UBIFS. It works, but 'lib_sort()' should nevertheless be fixed. Although it
is harmless to have this piece of code in UBIFS.

This patch adds that code to both UBIFS 'cmp()' functions:
'data_nodes_cmp()' and 'nondata_nodes_cmp()'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index dafef3d195d3..4fc31ca0e3b1 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,6 +125,9 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	struct ubifs_scan_node *sa, *sb;
 
 	cond_resched();
+	if (a == b)
+		return 0;
+
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
 
@@ -165,6 +168,9 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	struct ubifs_scan_node *sa, *sb;
 
 	cond_resched();
+	if (a == b)
+		return 0;
+
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
 
-- 
cgit v1.2.3


From 3bb66b47a4268a4419594b4c4aec58dbeb6b58d2 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 7 Aug 2010 10:06:11 +0300
Subject: UBIFS: introduce list sorting debugging checks

The UBIFS bug in the GC list sorting comparison functions inspired
me to write internal debugging check functions which verify that
the list of nodes is sorted properly.

So, this patch implements 2 new debugging functions:
 o 'dbg_check_data_nodes_order()' - check order of data nodes list
 o 'dbg_check_nondata_nodes_order()' - check order of non-data nodes list

The debugging functions are executed only if general UBIFS debugging checks are
enabled. And they are compiled out if UBIFS debugging is disabled.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ubifs/debug.h |   4 ++
 fs/ubifs/gc.c    |  10 +++-
 3 files changed, 168 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..c664f10aec6c 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
 	return err;
 }
 
+/**
+ * dbg_check_data_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t blka, blkb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		blka = key_block(c, &sa->key);
+		blkb = key_block(c, &sb->key);
+
+		if (blka > blkb) {
+			ubifs_err("larger block %u goes before %u", blka, blkb);
+			goto error_dump;
+		}
+		if (blka == blkb) {
+			ubifs_err("two data nodes for the same block");
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	dbg_dump_node(c, sa->node);
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_nondata_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t hasha, hashb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			ubifs_err("non-inode node goes before inode node");
+			goto error_dump;
+		}
+
+		if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+			continue;
+
+		if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			/* Inode nodes are sorted in descending size order */
+			if (sa->len < sb->len) {
+				ubifs_err("smaller inode node goes first");
+				goto error_dump;
+			}
+			continue;
+		}
+
+		/*
+		 * This is either a dentry or xentry, which should be sorted in
+		 * ascending (parent ino, hash) order.
+		 */
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		hasha = key_block(c, &sa->key);
+		hashb = key_block(c, &sb->key);
+
+		if (hasha > hashb) {
+			ubifs_err("larger hash %u goes before %u", hasha, hashb);
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_msg("dumping first node");
+	dbg_dump_node(c, sa->node);
+	ubifs_msg("dumping second node");
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+	return 0;
+}
+
 static int invocation_cnt;
 
 int dbg_force_in_the_gaps(void)
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
 			int row, int col);
 int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
 			 loff_t size);
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 
 /* Force the use of in-the-gaps method for testing */
 
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define dbg_check_lprops(c)                        0
 #define dbg_check_lpt_nodes(c, cnode, row, col)    0
 #define dbg_check_inode_size(c, inode, size)       0
+#define dbg_check_data_nodes_order(c, head)        0
+#define dbg_check_nondata_nodes_order(c, head)     0
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 4fc31ca0e3b1..396f24a30af9 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -242,14 +242,13 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		      struct list_head *nondata, int *min)
 {
+	int err;
 	struct ubifs_scan_node *snod, *tmp;
 
 	*min = INT_MAX;
 
 	/* Separate data nodes and non-data nodes */
 	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		int err;
-
 		ubifs_assert(snod->type == UBIFS_INO_NODE  ||
 			     snod->type == UBIFS_DATA_NODE ||
 			     snod->type == UBIFS_DENT_NODE ||
@@ -293,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	/* Sort data and non-data nodes */
 	list_sort(c, &sleb->nodes, &data_nodes_cmp);
 	list_sort(c, nondata, &nondata_nodes_cmp);
+
+	err = dbg_check_data_nodes_order(c, &sleb->nodes);
+	if (err)
+		return err;
+	err = dbg_check_nondata_nodes_order(c, nondata);
+	if (err)
+		return err;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3a8fa0ed3e1a8dc54f9297d2a2292b87e03e21b5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 22 Aug 2010 21:27:30 +0300
Subject: UBIFS: improve error reporting when reading bad node

When an error happens during validation of read node, the typical situation is that
the LEB we read is unmapped (due to some bug). It is handy to include the mapping
status into the error message.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..9432431bf595 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -815,7 +815,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
 	return 0;
 
 out:
-	ubifs_err("bad node at LEB %d:%d", lnum, offs);
+	ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
+		  ubi_is_mapped(c->ubi, lnum));
 	dbg_dump_node(c, buf);
 	dbg_dump_stack();
 	return -EINVAL;
-- 
cgit v1.2.3


From 314dd2a05340fffbe217c5e40ec6c3bd1d07bf89 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 3 Sep 2010 10:07:48 -0500
Subject: dlm: Don't send callback to node making lock request when "try 1cb"
 fails

When converting a lock, an lkb is in the granted state and also being used
to request a new state. In the case that the conversion was a "try 1cb"
type which has failed, and if the new state was incompatible with the old
state, a callback was being generated to the requesting node. This is
incorrect as callbacks should only be sent to all the other nodes holding
blocking locks. The requesting node should receive the normal (failed)
response to its "try 1cb" conversion request only.

This was discovered while debugging a performance problem on GFS2, however
this fix also speeds up GFS as well. In the GFS2 case the performance gain
is over 10x for cases of write activity to an inode whose glock is cached
on another, idle (wrt that glock) node.

(comment added, dct)

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Tested-by: Abhijith Das <adas@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
 	struct dlm_lkb *gr;
 
 	list_for_each_entry(gr, head, lkb_statequeue) {
+		/* skip self when sending basts to convertqueue */
+		if (gr == lkb)
+			continue;
 		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
 			queue_bast(r, gr, lkb->lkb_rqmode);
 			gr->lkb_highbast = lkb->lkb_rqmode;
-- 
cgit v1.2.3


From 0e54c8992a35ab8126e8a7661007ddc1745a2597 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segooon@gmail.com>
Date: Sun, 5 Sep 2010 22:33:00 +0400
Subject: UBIFS: check return code of ubifs_lpt_lookup

Function ubifs_lpt_lookup may return ERR_PTR(...). Check for it.

[Tweaked by Artem Bityutskiy]

Signed-off-by: Vasiliy Kulikov <segooon@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
 		goto out;
 	for (i = 0; i < c->lsave_cnt; i++) {
 		int lnum = c->lsave[i];
+		struct ubifs_lprops *lprops;
 
 		/*
 		 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
 		 */
 		if (lnum >= c->leb_cnt)
 			continue;
-		ubifs_lpt_lookup(c, lnum);
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
 	}
 out:
 	vfree(buf);
-- 
cgit v1.2.3


From 8c893a5545ca772744376295690723dcb0b47d96 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segooon@gmail.com>
Date: Sun, 5 Sep 2010 22:33:04 +0400
Subject: UBIFS: check return code of pnode_lookup

Function pnode_lookup may return ERR_PTR(...). Check for it.

Signed-off-by: Vasiliy Kulikov <segooon@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
 	struct ubifs_pnode *pnode;
 
 	pnode = pnode_lookup(c, 0);
+	if (IS_ERR(pnode))
+		return PTR_ERR(pnode);
+
 	while (pnode) {
 		do_make_pnode_dirty(c, pnode);
 		pnode = next_pnode_to_dirty(c, pnode);
-- 
cgit v1.2.3


From 1132b26029918aa8fb5ba24a81b5c234e61f356c Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 13 Aug 2010 17:31:16 -0400
Subject: nfsd: remove duplicate NFS4_STATEID_SIZE declaration

Use NFS4_STATEID_SIZE from include/linux/nfs4

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..014482c4e57d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
-- 
cgit v1.2.3


From 17cebf658e088935d4bdebfc7ad9800e9fc4a0b2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Aug 2010 16:55:22 +1000
Subject: sunrpc: extract some common sunrpc_cache code from nfsd

Rather can duplicating this idiom twice, put it in an inline function.
This reduces the usage of 'expiry_time' out side the sunrpc/cache.c
code and thus the impact of a change that is about to be made to that
field.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c             | 9 +++------
 include/linux/sunrpc/cache.h | 6 ++++++
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..e56827b88fd2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -935,10 +935,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
 
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
 	if (!IS_ERR(ek)) {
-		ek->h.expiry_time = get_seconds()-1;
+		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
-	svc_expkey_cache.nextcheck = get_seconds();
 }
 
 static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +972,9 @@ static void exp_unhash(struct svc_export *exp)
 
 	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
 	if (!IS_ERR(ek)) {
-		ek->h.expiry_time = get_seconds()-1;
+		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
-	svc_expkey_cache.nextcheck = get_seconds();
 }
 	
 /*
@@ -1097,8 +1095,7 @@ out:
 static void
 exp_do_unexport(svc_export *unexp)
 {
-	unexp->h.expiry_time = get_seconds()-1;
-	svc_export_cache.nextcheck = get_seconds();
+	sunrpc_invalidate(&unexp->h, &svc_export_cache);
 	exp_unhash(unexp);
 	exp_fsid_unhash(unexp);
 }
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 7bf3e84b92f4..0e1febf4e5bc 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -228,4 +228,10 @@ static inline time_t get_expiry(char **bpp)
 	return rv;
 }
 
+static inline void sunrpc_invalidate(struct cache_head *h,
+				     struct cache_detail *detail)
+{
+	h->expiry_time = get_seconds() - 1;
+	detail->nextcheck = get_seconds();
+}
 #endif /*  _LINUX_SUNRPC_CACHE_H_ */
-- 
cgit v1.2.3


From c5b29f885afe890f953f7f23424045cdad31d3e4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Aug 2010 16:55:22 +1000
Subject: sunrpc: use seconds since boot in expiry cache

This protects us from confusion when the wallclock time changes.

We convert to and from wallclock when  setting or reading expiry
times.

Also use seconds since boot for last_clost time.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfs/dns_resolve.c         |  6 +++---
 fs/nfsd/nfs4idmap.c          |  2 +-
 include/linux/sunrpc/cache.h | 28 +++++++++++++++++++++++++---
 net/sunrpc/cache.c           | 36 +++++++++++++++++++-----------------
 4 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
 		return 0;
 	}
 	item = container_of(h, struct nfs_dns_ent, h);
-	ttl = (long)item->h.expiry_time - (long)get_seconds();
+	ttl = item->h.expiry_time - seconds_since_boot();
 	if (ttl < 0)
 		ttl = 0;
 
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 	ttl = get_expiry(&buf);
 	if (ttl == 0)
 		goto out;
-	key.h.expiry_time = ttl + get_seconds();
+	key.h.expiry_time = ttl + seconds_since_boot();
 
 	ret = -ENOMEM;
 	item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
 		goto out_err;
 	ret = -ETIMEDOUT;
 	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-			|| (*item)->h.expiry_time < get_seconds()
+			|| (*item)->h.expiry_time < seconds_since_boot()
 			|| cd->flush_time > (*item)->h.last_refresh)
 		goto out_put;
 	ret = -ENOENT;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..808b33a4a090 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -550,7 +550,7 @@ do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
 		goto out_err;
 	ret = -ETIMEDOUT;
 	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-			|| (*item)->h.expiry_time < get_seconds()
+			|| (*item)->h.expiry_time < seconds_since_boot()
 			|| detail->flush_time > (*item)->h.last_refresh)
 		goto out_put;
 	ret = -ENOENT;
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 0e1febf4e5bc..ece432b7f87f 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -218,20 +218,42 @@ static inline int get_int(char **bpp, int *anint)
 	return 0;
 }
 
+/*
+ * timestamps kept in the cache are expressed in seconds
+ * since boot.  This is the best for measuring differences in
+ * real time.
+ */
+static inline time_t seconds_since_boot(void)
+{
+	struct timespec boot;
+	getboottime(&boot);
+	return get_seconds() - boot.tv_sec;
+}
+
+static inline time_t convert_to_wallclock(time_t sinceboot)
+{
+	struct timespec boot;
+	getboottime(&boot);
+	return boot.tv_sec + sinceboot;
+}
+
 static inline time_t get_expiry(char **bpp)
 {
 	int rv;
+	struct timespec boot;
+
 	if (get_int(bpp, &rv))
 		return 0;
 	if (rv < 0)
 		return 0;
-	return rv;
+	getboottime(&boot);
+	return rv - boot.tv_sec;
 }
 
 static inline void sunrpc_invalidate(struct cache_head *h,
 				     struct cache_detail *detail)
 {
-	h->expiry_time = get_seconds() - 1;
-	detail->nextcheck = get_seconds();
+	h->expiry_time = seconds_since_boot() - 1;
+	detail->nextcheck = seconds_since_boot();
 }
 #endif /*  _LINUX_SUNRPC_CACHE_H_ */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b06410e584e..8dc121955fdc 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -42,7 +42,7 @@ static void cache_revisit_request(struct cache_head *item);
 
 static void cache_init(struct cache_head *h)
 {
-	time_t now = get_seconds();
+	time_t now = seconds_since_boot();
 	h->next = NULL;
 	h->flags = 0;
 	kref_init(&h->ref);
@@ -52,7 +52,7 @@ static void cache_init(struct cache_head *h)
 
 static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
-	return  (h->expiry_time < get_seconds()) ||
+	return  (h->expiry_time < seconds_since_boot()) ||
 		(detail->flush_time > h->last_refresh);
 }
 
@@ -127,7 +127,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
 static void cache_fresh_locked(struct cache_head *head, time_t expiry)
 {
 	head->expiry_time = expiry;
-	head->last_refresh = get_seconds();
+	head->last_refresh = seconds_since_boot();
 	set_bit(CACHE_VALID, &head->flags);
 }
 
@@ -238,7 +238,7 @@ int cache_check(struct cache_detail *detail,
 
 	/* now see if we want to start an upcall */
 	refresh_age = (h->expiry_time - h->last_refresh);
-	age = get_seconds() - h->last_refresh;
+	age = seconds_since_boot() - h->last_refresh;
 
 	if (rqstp == NULL) {
 		if (rv == -EAGAIN)
@@ -253,7 +253,7 @@ int cache_check(struct cache_detail *detail,
 				cache_revisit_request(h);
 				if (rv == -EAGAIN) {
 					set_bit(CACHE_NEGATIVE, &h->flags);
-					cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY);
+					cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
 					cache_fresh_unlocked(h, detail);
 					rv = -ENOENT;
 				}
@@ -388,11 +388,11 @@ static int cache_clean(void)
 			return -1;
 		}
 		current_detail = list_entry(next, struct cache_detail, others);
-		if (current_detail->nextcheck > get_seconds())
+		if (current_detail->nextcheck > seconds_since_boot())
 			current_index = current_detail->hash_size;
 		else {
 			current_index = 0;
-			current_detail->nextcheck = get_seconds()+30*60;
+			current_detail->nextcheck = seconds_since_boot()+30*60;
 		}
 	}
 
@@ -477,7 +477,7 @@ EXPORT_SYMBOL_GPL(cache_flush);
 void cache_purge(struct cache_detail *detail)
 {
 	detail->flush_time = LONG_MAX;
-	detail->nextcheck = get_seconds();
+	detail->nextcheck = seconds_since_boot();
 	cache_flush();
 	detail->flush_time = 1;
 }
@@ -902,7 +902,7 @@ static int cache_release(struct inode *inode, struct file *filp,
 		filp->private_data = NULL;
 		kfree(rp);
 
-		cd->last_close = get_seconds();
+		cd->last_close = seconds_since_boot();
 		atomic_dec(&cd->readers);
 	}
 	module_put(cd->owner);
@@ -1034,7 +1034,7 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
 	int len;
 
 	if (atomic_read(&detail->readers) == 0 &&
-	    detail->last_close < get_seconds() - 30) {
+	    detail->last_close < seconds_since_boot() - 30) {
 			warn_no_listener(detail);
 			return -EINVAL;
 	}
@@ -1219,7 +1219,8 @@ static int c_show(struct seq_file *m, void *p)
 
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
-			   cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags);
+			   convert_to_wallclock(cp->expiry_time),
+			   atomic_read(&cp->ref.refcount), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
@@ -1285,7 +1286,7 @@ static ssize_t read_flush(struct file *file, char __user *buf,
 	unsigned long p = *ppos;
 	size_t len;
 
-	sprintf(tbuf, "%lu\n", cd->flush_time);
+	sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time));
 	len = strlen(tbuf);
 	if (p >= len)
 		return 0;
@@ -1303,19 +1304,20 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 			   struct cache_detail *cd)
 {
 	char tbuf[20];
-	char *ep;
-	long flushtime;
+	char *bp, *ep;
+
 	if (*ppos || count > sizeof(tbuf)-1)
 		return -EINVAL;
 	if (copy_from_user(tbuf, buf, count))
 		return -EFAULT;
 	tbuf[count] = 0;
-	flushtime = simple_strtoul(tbuf, &ep, 0);
+	simple_strtoul(tbuf, &ep, 0);
 	if (*ep && *ep != '\n')
 		return -EINVAL;
 
-	cd->flush_time = flushtime;
-	cd->nextcheck = get_seconds();
+	bp = tbuf;
+	cd->flush_time = get_expiry(&bp);
+	cd->nextcheck = seconds_since_boot();
 	cache_flush();
 
 	*ppos += count;
-- 
cgit v1.2.3


From 2cf6d26a354ab6362e301b5a323832b02867df47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:10 -0400
Subject: block: pass gfp_mask and flags to sb_issue_discard

We'll need to get rid of the BLKDEV_IFL_BARRIER flag, and to facilitate
that and to make the interface less confusing pass all flags explicitly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ext4/mballoc.c      |  3 ++-
 fs/fat/fatent.c        |  4 +++-
 include/linux/blkdev.h | 11 +++++------
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..df44b345f662 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,7 +2566,8 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
+			       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..3a56a82f5658 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,9 @@ int fat_free_clusters(struct inode *inode, int cluster)
 
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
-					nr_clus * sbi->sec_per_clus);
+					nr_clus * sbi->sec_per_clus,
+					GFP_NOFS,
+					BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 
 				first_cl = cluster;
 			}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ef705f800ab..6b305eb4a343 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -881,13 +881,12 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
-static inline int sb_issue_discard(struct super_block *sb,
-				   sector_t block, sector_t nr_blocks)
+static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
-	block <<= (sb->s_blocksize_bits - 9);
-	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
-				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+				    nr_blocks << (sb->s_blocksize_bits - 9),
+				    gfp_mask, flags);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
cgit v1.2.3


From 80f6c29d8a758650d5c9eac05074b4b3e8c266df Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:11 -0400
Subject: xfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes and remove the EOPNOTSUPP
detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   | 16 ++--------------
 fs/xfs/linux-2.6/xfs_buf.h   | 11 +----------
 fs/xfs/linux-2.6/xfs_trace.h |  1 -
 fs/xfs/xfs_log.c             | 13 -------------
 4 files changed, 3 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ea79072f5210..b93ea3342281 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -929,19 +929,7 @@ xfs_buf_iodone_work(
 	xfs_buf_t		*bp =
 		container_of(work, xfs_buf_t, b_iodone_work);
 
-	/*
-	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-	 * ordered flag and reissue them.  Because we can't tell the higher
-	 * layers directly that they should not issue ordered I/O anymore, they
-	 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-	 */
-	if ((bp->b_error == EOPNOTSUPP) &&
-	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-		trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-		bp->b_flags &= ~XBF_ORDERED;
-		bp->b_flags |= _XFS_BARRIER_FAILED;
-		xfs_buf_iorequest(bp);
-	} else if (bp->b_iodone)
+	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
@@ -1200,7 +1188,7 @@ _xfs_buf_ioapply(
 
 	if (bp->b_flags & XBF_ORDERED) {
 		ASSERT(!(bp->b_flags & XBF_READ));
-		rw = WRITE_BARRIER;
+		rw = WRITE_FLUSH_FUA;
 	} else if (bp->b_flags & XBF_LOG_BUFFER) {
 		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
 		bp->b_flags &= ~_XBF_RUN_QUEUES;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d072e5ff923b..d533d64e2c3e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -86,14 +86,6 @@ typedef enum {
  */
 #define _XBF_PAGE_LOCKED	(1 << 22)
 
-/*
- * If we try a barrier write, but it fails we have to communicate
- * this to the upper layers.  Unfortunately b_error gets overwritten
- * when the buffer is re-issued so we have to add another flag to
- * keep this information.
- */
-#define _XFS_BARRIER_FAILED	(1 << 23)
-
 typedef unsigned int xfs_buf_flags_t;
 
 #define XFS_BUF_FLAGS \
@@ -114,8 +106,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_RUN_QUEUES,	"RUN_QUEUES" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }, \
-	{ _XFS_BARRIER_FAILED,	"BARRIER_FAILED" }
+	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }
 
 
 typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..8fe311a456e2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -325,7 +325,6 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_cond_lock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
-DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
 DEFINE_BUF_EVENT(xfs_buf_iowait_done);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f4..430a8fc02c1f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -916,19 +916,6 @@ xlog_iodone(xfs_buf_t *bp)
 	aborted = 0;
 	l = iclog->ic_log;
 
-	/*
-	 * If the _XFS_BARRIER_FAILED flag was set by a lower
-	 * layer, it means the underlying device no longer supports
-	 * barrier I/O. Warn loudly and turn off barriers.
-	 */
-	if (bp->b_flags & _XFS_BARRIER_FAILED) {
-		bp->b_flags &= ~_XFS_BARRIER_FAILED;
-		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		xfs_fs_cmn_err(CE_WARN, l->l_mp,
-				"xlog_iodone: Barriers are no longer supported"
-				" by device. Disabling barriers\n");
-	}
-
 	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
-- 
cgit v1.2.3


From c3b9a62c8f932f32a733d6b628f61f3f28345727 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:12 -0400
Subject: btrfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/btrfs/disk-io.c     | 19 ++++---------------
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/volumes.c     |  4 ----
 fs/btrfs/volumes.h     |  1 -
 4 files changed, 5 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..5e789f4a3ed0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2063,7 +2063,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (printk_ratelimit()) {
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
@@ -2200,21 +2200,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers && device->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       device->name);
-				set_buffer_uptodate(bh);
-				device->barriers = 0;
-				/* one reference for submit_bh */
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE_SYNC, bh);
-			}
-		} else {
+		if (i == last_barrier && do_barriers)
+			ret = submit_bh(WRITE_FLUSH_FUA, bh);
+		else
 			ret = submit_bh(WRITE_SYNC, bh);
-		}
 
 		if (ret)
 			errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..43dc9ea9aef6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1696,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+			BLKDEV_IFL_WAIT);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..e25e46a8b4e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		INIT_LIST_HEAD(&device->dev_list);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	trans = btrfs_start_transaction(root, 0);
 	lock_chunks(root);
 
-	device->barriers = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -3084,7 +3081,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..2b638b6e4eea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
 	int running_pending;
 	u64 generation;
 
-	int barriers;
 	int writeable;
 	int in_fs_metadata;
 
-- 
cgit v1.2.3


From f1e4d518c3beddf67f7722f3548eda0ec7006204 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:13 -0400
Subject: gfs2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Acked-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/gfs2/log.c  | 19 +++++--------------
 fs/gfs2/rgrp.c |  5 ++---
 2 files changed, 7 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index cde1248a6225..9c65170e932e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	lh->lh_hash = cpu_to_be32(hash);
 
 	bh->b_end_io = end_buffer_write_sync;
-	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | REQ_META, bh);
-	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		set_buffer_uptodate(bh);
-		fs_info(sdp, "barrier sync failed - disabling barriers\n");
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-		lock_buffer(bh);
-skip_barrier:
-		get_bh(bh);
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		submit_bh(WRITE_SYNC | REQ_META, bh);
-		wait_on_buffer(bh);
-	}
+	else
+		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+	wait_on_buffer(bh);
+
 	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..379316472918 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,8 +854,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT |
-							    BLKDEV_IFL_BARRIER);
+							    BLKDEV_IFL_WAIT);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -870,7 +869,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					 BLKDEV_IFL_WAIT);
 		if (rv)
 			goto fail;
 	}
-- 
cgit v1.2.3


From 7cd33ad23ec41d685902159be8b2c6552fab8bd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:14 -0400
Subject: reiserfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes and remove the EOPNOTSUPP
detection for barriers.  Note that reiserfs had a fairly different code
path for barriers before as it wa the only filesystem actually making use
of them.  The new code always uses the old non-barrier codepath and just
sets the WRITE_FLUSH_FUA explicitly for the journal commits.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/reiserfs/journal.c | 106 ++++++++++----------------------------------------
 1 file changed, 20 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..076c8b194682 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
 	return 0;
 }
 
-static void disable_barrier(struct super_block *s)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-	printk("reiserfs: disabling flush barriers on %s\n",
-	       reiserfs_bdevname(s));
-}
-
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 							 *sb)
 {
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	submit_bh(WRITE, bh);
 }
 
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	return submit_bh(WRITE_BARRIER, bh);
-}
-
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		reiserfs_write_unlock(s);
-		sync_dirty_buffer(bh);
-		reiserfs_write_lock(s);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
 	struct buffer_head *tbh = NULL;
 	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int barrier = 0;
 	int retval = 0;
 	int write_len;
 
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-	/* We're skipping the commit if there's an error */
-	if (retval || reiserfs_is_journal_aborted(journal))
-		barrier = 0;
-
-	/* wait on everything written so far before writing the commit
-	 * if we are in barrier mode, send the commit down now
-	 */
-	barrier = reiserfs_barrier_flush(s);
-	if (barrier) {
-		int ret;
-		lock_buffer(jl->j_commit_bh);
-		ret = submit_barrier_buffer(jl->j_commit_bh);
-		if (ret == -EOPNOTSUPP) {
-			set_buffer_uptodate(jl->j_commit_bh);
-			disable_barrier(s);
-			barrier = 0;
-		}
-	}
 	for (i = 0; i < (jl->j_len + 1); i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
 
 	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
 
-	if (!barrier) {
-		/* If there was a write error in the journal - we can't commit
-		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propagating the write error out to
-		 * the file system. */
-		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-			if (buffer_dirty(jl->j_commit_bh))
-				BUG();
-			mark_buffer_dirty(jl->j_commit_bh) ;
-			reiserfs_write_unlock(s);
-			sync_dirty_buffer(jl->j_commit_bh) ;
-			reiserfs_write_lock(s);
-		}
-	} else {
+	/* If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system. */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
 		reiserfs_write_unlock(s);
-		wait_on_buffer(jl->j_commit_bh);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock(s);
 	}
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(sb)) {
-			int ret;
-			lock_buffer(journal->j_header_bh);
-			ret = submit_barrier_buffer(journal->j_header_bh);
-			if (ret == -EOPNOTSUPP) {
-				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(sb);
-				goto sync;
-			}
-			reiserfs_write_unlock(sb);
-			wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-			check_barrier_completion(sb, journal->j_header_bh);
-		} else {
-		      sync:
-			set_buffer_dirty(journal->j_header_bh);
-			reiserfs_write_unlock(sb);
+		set_buffer_dirty(journal->j_header_bh);
+		reiserfs_write_unlock(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
 			sync_dirty_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-		}
+
+		reiserfs_write_lock(sb);
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
 					 "IO error during journal replay");
-- 
cgit v1.2.3


From f8c131f5b6ffc899a70b30e541f367d47f89691c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:15 -0400
Subject: nilfs2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

tj: nilfs is now fixed to wait for discard completion.  Updated this
    patch accordingly and dropped warning about it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/nilfs2/super.c     | 10 +---------
 fs/nilfs2/the_nilfs.c |  7 ++-----
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..faa5078ff751 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -178,17 +178,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-
 	if (nilfs_test_opt(sbi, BARRIER)) {
 		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-					  WRITE_SYNC | WRITE_BARRIER);
-		if (err == -EOPNOTSUPP) {
-			nilfs_warning(sbi->s_super, __func__,
-				      "barrier-based sync failed. "
-				      "disabling barriers\n");
-			nilfs_clear_opt(sbi, BARRIER);
-			goto retry;
-		}
+					  WRITE_SYNC | WRITE_FLUSH_FUA);
 	} else {
 		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 4317f177ea7c..400b2caef4d8 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -774,9 +774,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS,
-						   BLKDEV_IFL_WAIT |
-						   BLKDEV_IFL_BARRIER);
+						   GFP_NOFS, BLKDEV_IFL_WAIT);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -786,8 +784,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS,
-					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					   GFP_NOFS, BLKDEV_IFL_WAIT);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4524451ef7e88c64a868a8f5a0b49bda73beb2a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:16 -0400
Subject: jbd: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for journal commits and remove the
EOPNOTSUPP detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd/commit.c | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..484c5e5fa8af 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
 	JBUFFER_TRACE(descriptor, "write commit block");
 	set_buffer_dirty(bh);
 
-	if (journal->j_flags & JFS_BARRIER) {
-		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-
-		/*
-		 * Is it possible for another commit to fail at roughly
-		 * the same time as this one?  If so, we don't want to
-		 * trust the barrier flag in the super, but instead want
-		 * to remember if we sent a barrier request
-		 */
-		if (ret == -EOPNOTSUPP) {
-			char b[BDEVNAME_SIZE];
-
-			printk(KERN_WARNING
-				"JBD: barrier-based sync failed on %s - "
-				"disabling barriers\n",
-				bdevname(journal->j_dev, b));
-			spin_lock(&journal->j_state_lock);
-			journal->j_flags &= ~JFS_BARRIER;
-			spin_unlock(&journal->j_state_lock);
-
-			/* And try again, without the barrier */
-			set_buffer_uptodate(bh);
-			set_buffer_dirty(bh);
-			ret = sync_dirty_buffer(bh);
-		}
-	} else {
+	if (journal->j_flags & JFS_BARRIER)
+		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+	else
 		ret = sync_dirty_buffer(bh);
-	}
 
 	put_bh(bh);		/* One for getblk() */
 	journal_put_journal_head(descriptor);
-- 
cgit v1.2.3


From f73bee49855fe968e87af18e44f2d17e1e675e74 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 18 Aug 2010 15:56:56 +0200
Subject: jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on
 barrier

Currently JBD2 relies blkdev_issue_flush() draining the queue when ASYNC_COMMIT
feature is set. This property is going away so make JBD2 wait for buffers it
needs on its own before submitting the cache flush.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..d6aeb1f6cfe0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -701,6 +701,16 @@ start_journal_io:
 		}
 	}
 
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err) {
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
+		err = 0;
+	}
+
 	/* 
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
@@ -719,19 +729,6 @@ start_journal_io:
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
-		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
-	}
-
-	err = journal_finish_inode_data_buffers(journal, commit_transaction);
-	if (err) {
-		printk(KERN_WARNING
-			"JBD2: Detected IO errors while flushing file data "
-		       "on %s\n", journal->j_devname);
-		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-			jbd2_journal_abort(journal, err);
-		err = 0;
 	}
 
 	/* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +842,12 @@ wait_for_iobuf:
 	}
 	if (!err && !is_journal_aborted(journal))
 		err = journal_wait_on_commit_record(journal, cbh);
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	    journal->j_flags & JBD2_BARRIER) {
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+				   BLKDEV_IFL_WAIT);
+	}
 
 	if (err)
 		jbd2_journal_abort(journal, err);
-- 
cgit v1.2.3


From 9c35575bbe6b1dd4914a5323c8df8b3159edcc75 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:17 -0400
Subject: jbd2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for journal commits and remove the
EOPNOTSUPP detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 43 ++++---------------------------------------
 1 file changed, 4 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d6aeb1f6cfe0..f204e27f44d1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -134,25 +134,11 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-		if (ret == -EOPNOTSUPP) {
-			printk(KERN_WARNING
-			       "JBD2: Disabling barriers on %s, "
-			       "not supported by device\n", journal->j_devname);
-			write_lock(&journal->j_state_lock);
-			journal->j_flags &= ~JBD2_BARRIER;
-			write_unlock(&journal->j_state_lock);
-
-			/* And try again, without the barrier */
-			lock_buffer(bh);
-			set_buffer_uptodate(bh);
-			clear_buffer_dirty(bh);
-			ret = submit_bh(WRITE_SYNC_PLUG, bh);
-		}
-	} else {
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+	else
 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-	}
+
 	*cbh = bh;
 	return ret;
 }
@@ -166,29 +152,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
 {
 	int ret = 0;
 
-retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-		printk(KERN_WARNING
-		       "JBD2: %s: disabling barries on %s - not supported "
-		       "by device\n", __func__, journal->j_devname);
-		write_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		write_unlock(&journal->j_state_lock);
-
-		lock_buffer(bh);
-		clear_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		bh->b_end_io = journal_end_buffer_io_sync;
-
-		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-		if (ret) {
-			unlock_buffer(bh);
-			return ret;
-		}
-		goto retry;
-	}
 
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
-- 
cgit v1.2.3


From 61002f7db33c7d064cddcdab680fb750fa43d8fd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:18 -0400
Subject: ext4: do not send discards as barriers

ext4 already uses synchronous discards, no need to add I/O barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index df44b345f662..a22bfef3da95 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2567,7 +2567,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
 	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
-			       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+			       BLKDEV_IFL_WAIT);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-- 
cgit v1.2.3


From e045db80d07250312500b2ed707b84dc703189d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:19 -0400
Subject: fat: do not send discards as barriers

fat already uses synchronous discards, no need to add I/O barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fat/fatent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 3a56a82f5658..f9a0b7ae8648 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -579,7 +579,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 					fat_clus_to_blknr(sbi, first_cl),
 					nr_clus * sbi->sec_per_clus,
 					GFP_NOFS,
-					BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					BLKDEV_IFL_WAIT);
 
 				first_cl = cluster;
 			}
-- 
cgit v1.2.3


From 0edd55faea7c8081bc826234b917501738a6218f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:23 -0400
Subject: block: remove the BH_Eopnotsupp flag

This flag was only set for barrier buffers, which we don't submit
anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/buffer.c                 | 7 +------
 fs/fat/misc.c               | 5 +----
 include/linux/buffer_head.h | 2 --
 3 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..7f0b9b083f77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -2891,7 +2891,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3030,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 		bh->b_end_io = end_buffer_write_sync;
 		ret = submit_bh(rw, bh);
 		wait_on_buffer(bh);
-		if (buffer_eopnotsupp(bh)) {
-			clear_buffer_eopnotsupp(bh);
-			ret = -EOPNOTSUPP;
-		}
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
 	} else {
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
-		if (buffer_eopnotsupp(bhs[i])) {
-			clear_buffer_eopnotsupp(bhs[i]);
-			err = -EOPNOTSUPP;
-		} else if (!err && !buffer_uptodate(bhs[i]))
+		if (!err && !buffer_uptodate(bhs[i]))
 			err = -EIO;
 	}
 	return err;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index fc999f583fda..dd1b25b2641c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,6 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
-	BH_Eopnotsupp,	/* DEPRECATED: operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
@@ -124,7 +123,6 @@ BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
-BUFFER_FNS(Eopnotsupp, eopnotsupp)
 BUFFER_FNS(Unwritten, unwritten)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
-- 
cgit v1.2.3


From ddee5cdb70e6f87de2fc696b87bd7bd184a51eb8 Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Sat, 22 May 2010 16:26:33 +0800
Subject: Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8.

The reason why we need this ioctl is to offer the none-privileged
end-user a possibility to get filesys info gathering.

We use OCFS2_IOC_INFO to manipulate the new ioctl, userspace passes a
structure to kernel containing an array of request pointers and request
count, such as,

* From userspace:

struct ocfs2_info_blocksize oib = {
        .ib_req = {
                .ir_magic = OCFS2_INFO_MAGIC,
                .ir_code = OCFS2_INFO_BLOCKSIZE,
                ...
        }
        ...
}

struct ocfs2_info_clustersize oic = {
        ...
}

uint64_t reqs[2] = {(unsigned long)&oib,
                    (unsigned long)&oic};

struct ocfs2_info info = {
        .oi_requests = reqs,
        .oi_count = 2,
}

ret = ioctl(fd, OCFS2_IOC_INFO, &info);

* In kernel:

Get the request pointers from *info*, then handle each request one bye one.

Idea here is to make the spearated request small enough to guarantee
a better backward&forward compatibility since a small piece of request
would be less likely to be broken if filesys on raw disk get changed.

Currently, the following 7 requests are supported per the requirement from
userspace tool o2info, and I believe it will grow over time:-)

        OCFS2_INFO_CLUSTERSIZE
        OCFS2_INFO_BLOCKSIZE
        OCFS2_INFO_MAXSLOTS
        OCFS2_INFO_LABEL
        OCFS2_INFO_UUID
        OCFS2_INFO_FS_FEATURES
        OCFS2_INFO_JOURNAL_SIZE

This ioctl is only specific to OCFS2.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/ioctl.c       | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/ocfs2_ioctl.h |  95 +++++++++++++
 2 files changed, 451 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
 
 #include <linux/ext2_fs.h>
 
+#define o2info_from_user(a, b)	\
+		copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)	\
+		copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+					struct ocfs2_info_request __user *req)
+{
+	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+	(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+		__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
 	int status;
@@ -109,6 +129,328 @@ bail:
 	return status;
 }
 
+int ocfs2_info_handle_blocksize(struct inode *inode,
+				struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_blocksize oib;
+
+	if (o2info_from_user(oib, req))
+		goto bail;
+
+	oib.ib_blocksize = inode->i_sb->s_blocksize;
+	oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oib, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oib, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_clustersize oic;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oic, req))
+		goto bail;
+
+	oic.ic_clustersize = osb->s_clustersize;
+	oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oic, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oic, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+			       struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_maxslots oim;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oim, req))
+		goto bail;
+
+	oim.im_max_slots = osb->max_slots;
+	oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oim, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oim, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+			    struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_label oil;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oil, req))
+		goto bail;
+
+	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+	oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oil, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oil, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+			   struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_uuid oiu;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oiu, req))
+		goto bail;
+
+	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+	oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oiu, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oiu, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_fs_features oif;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oif, req))
+		goto bail;
+
+	oif.if_compat_features = osb->s_feature_compat;
+	oif.if_incompat_features = osb->s_feature_incompat;
+	oif.if_ro_compat_features = osb->s_feature_ro_compat;
+	oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oif, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oif, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+				   struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_journal_size oij;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oij, req))
+		goto bail;
+
+	oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+	oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oij, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oij, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oir, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oir, req);
+
+	return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	status = -EINVAL;
+	if (oir.ir_magic != OCFS2_INFO_MAGIC)
+		goto bail;
+
+	switch (oir.ir_code) {
+	case OCFS2_INFO_BLOCKSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+			status = ocfs2_info_handle_blocksize(inode, req);
+		break;
+	case OCFS2_INFO_CLUSTERSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+			status = ocfs2_info_handle_clustersize(inode, req);
+		break;
+	case OCFS2_INFO_MAXSLOTS:
+		if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+			status = ocfs2_info_handle_maxslots(inode, req);
+		break;
+	case OCFS2_INFO_LABEL:
+		if (oir.ir_size == sizeof(struct ocfs2_info_label))
+			status = ocfs2_info_handle_label(inode, req);
+		break;
+	case OCFS2_INFO_UUID:
+		if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+			status = ocfs2_info_handle_uuid(inode, req);
+		break;
+	case OCFS2_INFO_FS_FEATURES:
+		if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+			status = ocfs2_info_handle_fs_features(inode, req);
+		break;
+	case OCFS2_INFO_JOURNAL_SIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+			status = ocfs2_info_handle_journal_size(inode, req);
+		break;
+	default:
+		status = ocfs2_info_handle_unknown(inode, req);
+		break;
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+			  u64 *req_addr, int compat_flag)
+{
+	int status = -EFAULT;
+	u64 __user *bp = NULL;
+
+	if (compat_flag) {
+#ifdef CONFIG_COMPAT
+		/*
+		 * pointer bp stores the base address of a pointers array,
+		 * which collects all addresses of separate request.
+		 */
+		bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+		BUG();
+#endif
+	} else
+		bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+	if (o2info_from_user(*req_addr, bp + idx))
+		goto bail;
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+		      int compat_flag)
+{
+	int i, status = 0;
+	u64 req_addr;
+	struct ocfs2_info_request __user *reqp;
+
+	if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+	    (!info->oi_requests)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	for (i = 0; i < info->oi_count; i++) {
+
+		status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+		if (status)
+			break;
+
+		reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+		if (!reqp) {
+			status = -EINVAL;
+			goto bail;
+		}
+
+		status = ocfs2_info_handle_request(inode, reqp);
+		if (status)
+			break;
+	}
+
+bail:
+	return status;
+}
+
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct reflink_arguments args;
 	const char *old_path, *new_path;
 	bool preserve;
+	struct ocfs2_info info;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		preserve = (args.preserve != 0);
 
 		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+				   sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 0);
 	default:
 		return -ENOTTY;
 	}
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	bool preserve;
 	struct reflink_arguments args;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ocfs2_info info;
 
 	switch (cmd) {
 	case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 
 		return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
 					   compat_ptr(args.new_path), preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+				   sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 1);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..9bc535499868 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
 };
 #define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
 
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST		(50)
+#define OCFS2_TEXT_UUID_LEN		(OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC		(0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+	__u64 oi_requests;	/* Array of __u64 pointers to requests */
+	__u32 oi_count;		/* Number of requests in info_requests */
+	__u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/	__u32 ir_magic;	/* Magic number */
+	__u32 ir_code;	/* Info request code */
+	__u32 ir_size;	/* Size of request */
+	__u32 ir_flags;	/* Request flags */
+/*10*/			/* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+	struct ocfs2_info_request ic_req;
+	__u32 ic_clustersize;
+	__u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+	struct ocfs2_info_request ib_req;
+	__u32 ib_blocksize;
+	__u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+	struct ocfs2_info_request im_req;
+	__u32 im_max_slots;
+	__u32 im_pad;
+};
+
+struct ocfs2_info_label {
+	struct ocfs2_info_request il_req;
+	__u8	il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+	struct ocfs2_info_request iu_req;
+	__u8	iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+	struct ocfs2_info_request if_req;
+	__u32 if_compat_features;
+	__u32 if_incompat_features;
+	__u32 if_ro_compat_features;
+	__u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+	struct ocfs2_info_request ij_req;
+	__u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+	OCFS2_INFO_CLUSTERSIZE = 1,
+	OCFS2_INFO_BLOCKSIZE,
+	OCFS2_INFO_MAXSLOTS,
+	OCFS2_INFO_LABEL,
+	OCFS2_INFO_UUID,
+	OCFS2_INFO_FS_FEATURES,
+	OCFS2_INFO_JOURNAL_SIZE,
+	OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT	(0x00000001)	/* Cluster coherency not
+							   required. This is a hint.
+							   It is up to ocfs2 whether
+							   the request can be fulfilled
+							   without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED		(0x40000000)	/* Filesystem understood
+							   this request and
+							   filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR		(0x80000000)	/* Error happened during
+							   request handling. */
+
+#define OCFS2_IOC_INFO		_IOR('o', 5, struct ocfs2_info)
+
 #endif /* OCFS2_IOCTL_H */
-- 
cgit v1.2.3


From 3c3f20c9813391ba4004764072989744395cf405 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 1 Jun 2010 13:58:13 +0800
Subject: ocfs2: Add some trace log for orphan scan.

Now orphan scan worker has no trace log, so it is
very hard to tell whether it is finished or blocked.
So add 2 mlog trace log so that we can tell whether
the current orphan scan worker is blocked or not.
It does help when I analyzed a orphan scan bug.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..04d41dfeab9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1888,6 +1888,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
 	os = &osb->osb_orphan_scan;
 
+	mlog(0, "Begin orphan scan\n");
+
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
 		goto out;
 
@@ -1920,6 +1922,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 unlock:
 	ocfs2_orphan_scan_unlock(osb, seqno);
 out:
+	mlog(0, "Orphan scan completed\n");
 	return;
 }
 
-- 
cgit v1.2.3


From 95fa859a268fd7d9bae6f2d4d095e3f61100ac1b Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 9 Jun 2010 16:48:59 +0800
Subject: ocfs2: Remove obscure error handling in direct_write.

In ocfs2, actually we don't allow any direct write pass i_size,
see the function ocfs2_prepare_inode_for_write. So we don't
need the bogus simple_setsize.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..4f9133f6368e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2312,17 +2312,6 @@ relock:
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
-			/*
-			 * direct write may have instantiated a few
-			 * blocks outside i_size. Trim these off again.
-			 * Don't need i_size_read because we hold i_mutex.
-			 *
-			 * XXX(truncate): this looks buggy because ocfs2 did not
-			 * actually implement ->truncate.  Take a look at
-			 * the new truncate sequence and update this accordingly
-			 */
-			if (*ppos + count > inode->i_size)
-				truncate_setsize(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
-- 
cgit v1.2.3


From 83fd9c7f65634ac440a6b9b7a63ba562f213ac60 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Thu, 10 Jun 2010 17:21:36 -0500
Subject: Reorganize data elements to reduce struct sizes

Thanks for the comments. I have incorportated them all.

CONFIG_OCFS2_FS_STATS is enabled and CONFIG_DEBUG_LOCK_ALLOC is disabled.
Statistics now look like -
ocfs2_write_ctxt: 2144 - 2136 = 8
ocfs2_inode_info: 1960 - 1848 = 112
ocfs2_journal: 168 - 160 = 8
ocfs2_lock_res: 336 - 304 = 32
ocfs2_refcount_tree: 512 - 472 = 40

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c         |  2 +-
 fs/ocfs2/inode.h        | 11 ++++-------
 fs/ocfs2/journal.h      |  3 ++-
 fs/ocfs2/ocfs2.h        | 23 +++++++++++++++--------
 fs/ocfs2/refcounttree.h |  4 ++--
 5 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..f477f18b35d5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
 	 * out in so that future reads from that region will get
 	 * zero's.
 	 */
-	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	unsigned int			w_num_pages;
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	struct page			*w_target_page;
 
 	/*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..0bc477a3aeb8 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,27 +46,24 @@ struct ocfs2_inode_info
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
-	u32				ip_clusters;
 	struct list_head		ip_io_markers;
+	u32				ip_clusters;
 
+	u16				ip_dyn_features;
 	struct mutex			ip_io_mutex;
-
 	u32				ip_flags; /* see below */
 	u32				ip_attr; /* inode attributes */
-	u16				ip_dyn_features;
 
 	/* protected by recovery_lock. */
 	struct inode			*ip_next_orphan;
 
-	u32				ip_dir_start_lookup;
-
 	struct ocfs2_caching_info	ip_metadata_cache;
-
 	struct ocfs2_extent_map		ip_extent_map;
-
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
 
+	u32				ip_dir_start_lookup;
+
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
 	struct buffer_head        *j_bh;      /* Journal disk inode block */
 	atomic_t                  j_num_trans; /* Number of transactions
 					        * currently in the system. */
+	spinlock_t                j_lock;
 	unsigned long             j_trans_id;
 	struct rw_semaphore       j_trans_barrier;
 	wait_queue_head_t         j_checkpointed;
 
-	spinlock_t                j_lock;
+	/* both fields protected by j_lock*/
 	struct list_head          j_la_cleanups;
 	struct work_struct        j_recovery_work;
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..65739b3b3276 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
 struct ocfs2_lock_res {
 	void                    *l_priv;
 	struct ocfs2_lock_res_ops *l_ops;
-	spinlock_t               l_lock;
+
 
 	struct list_head         l_blocked_list;
 	struct list_head         l_mask_waiters;
 
-	enum ocfs2_lock_type     l_type;
 	unsigned long		 l_flags;
 	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
-	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	struct ocfs2_dlm_lksb    l_lksb;
+	unsigned char            l_level;
+
+	/* Data packed - type enum ocfs2_lock_type */
+	unsigned char            l_type;
 
 	/* used from AST/BAST funcs. */
-	enum ocfs2_ast_action    l_action;
-	enum ocfs2_unlock_action l_unlock_action;
-	int                      l_requested;
-	int                      l_blocking;
+	/* Data packed - enum type ocfs2_ast_action */
+	unsigned char            l_action;
+	/* Data packed - enum type ocfs2_unlock_action */
+	unsigned char            l_unlock_action;
+	unsigned char            l_requested;
+	unsigned char            l_blocking;
 	unsigned int             l_pending_gen;
 
+	spinlock_t               l_lock;
+
+	struct ocfs2_dlm_lksb    l_lksb;
+
 	wait_queue_head_t        l_event;
 
 	struct list_head         l_debug_list;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..f04892d6175d 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
 	struct rb_node rf_node;
 	u64 rf_blkno;
 	u32 rf_generation;
+	struct kref rf_getcnt;
 	struct rw_semaphore rf_sem;
 	struct ocfs2_lock_res rf_lockres;
-	struct kref rf_getcnt;
 	int rf_removed;
 
 	/* the following 4 fields are used by caching_info. */
-	struct ocfs2_caching_info rf_ci;
 	spinlock_t rf_lock;
+	struct ocfs2_caching_info rf_ci;
 	struct mutex rf_io_mutex;
 	struct super_block *rf_sb;
 };
-- 
cgit v1.2.3


From 4c38881f87c21ada5609a4065cb10e3e74da0d6e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 5 Aug 2010 20:32:46 +0200
Subject: ocfs2: Remove ocfs2_sync_inode()

ocfs2_sync_inode() is used only from ocfs2_sync_file(). But all data has
already been written before calling ocfs2_sync_file() and ocfs2 doesn't use
inode's private_list for tracking metadata buffers thus sync_mapping_buffers()
is superfluous as well.

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4f9133f6368e..b03f6601fd71 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_sync_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-	return sync_mapping_buffers(inode->i_mapping);
-}
-
 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
 {
 	struct ocfs2_file_private *fp;
@@ -187,10 +181,6 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
 		   dentry->d_name.len, dentry->d_name.name);
 
-	err = ocfs2_sync_inode(dentry->d_inode);
-	if (err)
-		goto bail;
-
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
 		 * We still have to flush drive's caches to get data to the
-- 
cgit v1.2.3


From f9c57ada32ea3f2e12600cf274035fff063b2e0f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 2 Aug 2010 11:02:13 +0800
Subject: ocfs2: Remove unused old_id in ocfs2_commit_cache.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 04d41dfeab9a..5787802c2d6a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
 	int status = 0;
 	unsigned int flushed;
-	unsigned long old_id;
 	struct ocfs2_journal *journal = NULL;
 
 	mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 		goto finally;
 	}
 
-	old_id = ocfs2_inc_trans_id(journal);
+	ocfs2_inc_trans_id(journal);
 
 	flushed = atomic_read(&journal->j_num_trans);
 	atomic_set(&journal->j_num_trans, 0);
-- 
cgit v1.2.3


From 17ae521158d6fe89cfdd00a9990aa40d02e81391 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 2 Aug 2010 11:02:14 +0800
Subject: ocfs2: Remove obsolete comments before ocfs2_start_trans.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 5787802c2d6a..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -341,9 +341,6 @@ finally:
 	return status;
 }
 
-/* pass it NULL and it will allocate a new handle object for you.  If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
 handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 {
 	journal_t *journal = osb->journal->j_journal;
-- 
cgit v1.2.3


From 30ca22c70e3ef0a96ff84de69cd7e8561b416cb2 Mon Sep 17 00:00:00 2001
From: "Patrick J. LoPresti" <lopresti@gmail.com>
Date: Thu, 22 Jul 2010 15:03:41 -0700
Subject: ext3/ext4: Factor out disk addressability check

As part of adding support for OCFS2 to mount huge volumes, we need to
check that the sector_t and page cache of the system are capable of
addressing the entire volume.

An identical check already appears in ext3 and ext4.  This patch moves
the addressability check into its own function in fs/libfs.c and
modifies ext3 and ext4 to invoke it.

[Edited to -EINVAL instead of BUG_ON() for bad blocksize_bits -- Joel]

Signed-off-by: Patrick LoPresti <lopresti@gmail.com>
Cc: linux-ext4@vger.kernel.org
Acked-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ext3/super.c    |  4 ++--
 fs/ext4/super.c    |  8 +++-----
 fs/libfs.c         | 29 +++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 4 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..a367dd044280 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (le32_to_cpu(es->s_blocks_count) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	if (generic_check_addressable(sb->s_blocksize_bits,
+				      le32_to_cpu(es->s_blocks_count))) {
 		ext3_msg(sb, KERN_ERR,
 			"error: filesystem is too large to mount safely");
 		if (sizeof(sector_t) < 8)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..7f47c366bf15 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * Test whether we have more sectors than will fit in sector_t,
 	 * and whether the max offset is addressable by the page cache.
 	 */
-	if ((ext4_blocks_count(es) >
-	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
-	    (ext4_blocks_count(es) >
-	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+	ret = generic_check_addressable(sb->s_blocksize_bits,
+					ext4_blocks_count(es));
+	if (ret) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
-		ret = -EFBIG;
 		goto failed_mount;
 	}
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..8debe7b33769 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync)
 }
 EXPORT_SYMBOL(generic_file_fsync);
 
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits:	log of file system block size
+ * @num_blocks:		number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+	u64 last_fs_block = num_blocks - 1;
+
+	if (unlikely(num_blocks == 0))
+		return 0;
+
+	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+		return -EINVAL;
+
+	if ((last_fs_block >
+	     (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+	    (last_fs_block >
+	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - blocksize_bits))) {
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
 /*
  * No-op implementation of ->fsync for in-memory filesystems.
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76041b614758..1a759f40ab9e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2374,6 +2374,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 
 extern int generic_file_fsync(struct file *, int);
 
+extern int generic_check_addressable(unsigned, u64);
+
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_page(struct address_space *,
 				struct page *, struct page *);
-- 
cgit v1.2.3


From 1113e1b504f6e8d4364c0b73c9097828067d4617 Mon Sep 17 00:00:00 2001
From: "Patrick J. LoPresti" <lopresti@gmail.com>
Date: Thu, 22 Jul 2010 15:04:16 -0700
Subject: JBD2: Allow feature checks before journal recovery

Before we start accessing a huge (> 16 TiB) OCFS2 volume, we need to
confirm that its journal supports 64-bit offsets.  In particular, we
need to check the journal's feature bits before recovering the journal.

This is not possible with JBD2 at present, because the journal
superblock (where the feature bits reside) is not loaded from disk until
the journal is recovered.

This patch loads the journal superblock in
jbd2_journal_check_used_features() if it has not already been loaded,
allowing us to check the feature bits before journal recovery.

Signed-off-by: Patrick LoPresti <lopresti@gmail.com>
Cc: linux-ext4@vger.kernel.org
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/jbd2/journal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..262419f83d80 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 
 	if (!compat && !ro && !incompat)
 		return 1;
+	/* Load journal superblock if it is not loaded yet. */
+	if (journal->j_format_version == 0 &&
+	    journal_get_superblock(journal) != 0)
+		return 0;
 	if (journal->j_format_version == 1)
 		return 0;
 
-- 
cgit v1.2.3


From 3bdb8efd94a73bb137e3315cd831cbc874052b4b Mon Sep 17 00:00:00 2001
From: "Patrick J. LoPresti" <lopresti@gmail.com>
Date: Thu, 22 Jul 2010 15:05:57 -0700
Subject: OCFS2: Allow huge (> 16 TiB) volumes to mount

The OCFS2 developers have already done all of the hard work to allow
volumes larger than 16 TiB.  But there is still a "sanity check" in
fs/ocfs2/super.c that prevents the mounting of such volumes, even when
the cluster size and journal options would allow it.

This patch replaces that sanity check with a more sophisticated one to
mount a huge volume provided that (a) it is addressable by the raw
word/address size of the system (borrowing a test from ext4); (b) the
volume is using JBD2; and (c) the JBD2_FEATURE_INCOMPAT_64BIT flag is
set on the journal.

I factored out the sanity check into its own function.  I also moved it
from ocfs2_initialize_super() down to ocfs2_check_volume(); any earlier,
and the journal will not have been initialized yet.

This patch is one of a pair, and it depends on the other ("JBD2: Allow
feature checks before journal recovery").

I have tested this patch on small volumes, huge volumes, and huge
volumes without 64-bit block support in the journal.  All of them appear
to work or to fail gracefully, as appropriate.

Signed-off-by: Patrick LoPresti <lopresti@gmail.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/super.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..47415398d56a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1990,6 +1990,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 	return 0;
 }
 
+/* Make sure entire volume is addressable by our journal.  Requires
+   osb_clusters_at_boot to be valid and for the journal to have been
+   initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+	int status = 0;
+	u64 max_block =
+		ocfs2_clusters_to_blocks(osb->sb,
+					 osb->osb_clusters_at_boot) - 1;
+
+	/* 32-bit block number is always OK. */
+	if (max_block <= (u32)~0ULL)
+		goto out;
+
+	/* Volume is "huge", so see if our journal is new enough to
+	   support it. */
+	if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				       OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+	      jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+					       JBD2_FEATURE_INCOMPAT_64BIT))) {
+		mlog(ML_ERROR, "The journal cannot address the entire volume. "
+		     "Enable the 'block64' journal option with tunefs.ocfs2");
+		status = -EFBIG;
+		goto out;
+	}
+
+ out:
+	return status;
+}
+
 static int ocfs2_initialize_super(struct super_block *sb,
 				  struct buffer_head *bh,
 				  int sector_size,
@@ -2002,6 +2032,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	struct ocfs2_journal *journal;
 	__le32 uuid_net_key;
 	struct ocfs2_super *osb;
+	u64 total_blocks;
 
 	mlog_entry_void();
 
@@ -2214,11 +2245,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
-	    > (u32)~0UL) {
-		mlog(ML_ERROR, "Volume might try to write to blocks beyond "
-		     "what jbd can address in 32 bits.\n");
-		status = -EINVAL;
+	total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(di->i_clusters));
+
+	status = generic_check_addressable(osb->sb->s_blocksize_bits,
+					   total_blocks);
+	if (status) {
+		mlog(ML_ERROR, "Volume too large "
+		     "to mount safely on this system");
+		status = -EFBIG;
 		goto bail;
 	}
 
@@ -2380,6 +2415,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 		goto finally;
 	}
 
+	/* Now that journal has been initialized, check to make sure
+	   entire volume is addressable. */
+	status = ocfs2_journal_addressable(osb);
+	if (status)
+		goto finally;
+
 	/* If the journal was unmounted cleanly then we don't want to
 	 * recover anything. Otherwise, journal_load will do that
 	 * dirty work for us :) */
-- 
cgit v1.2.3


From a33f13efe05192e7a805018a2ce2b2afddd04057 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 16 Aug 2010 12:10:17 -0700
Subject: libfs: Fix shift bug in generic_check_addressable()

generic_check_addressable() erroneously shifts pages down by a block
factor when it should be shifting up.  To prevent overflow, we shift
blocks down to pages.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/libfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 8debe7b33769..62baa0387d6e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -925,6 +925,8 @@ EXPORT_SYMBOL(generic_file_fsync);
 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
 {
 	u64 last_fs_block = num_blocks - 1;
+	u64 last_fs_page =
+		last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
 
 	if (unlikely(num_blocks == 0))
 		return 0;
@@ -932,10 +934,8 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
 	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
 		return -EINVAL;
 
-	if ((last_fs_block >
-	     (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
-	    (last_fs_block >
-	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - blocksize_bits))) {
+	if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+	    (last_fs_page > (pgoff_t)(~0ULL))) {
 		return -EFBIG;
 	}
 	return 0;
-- 
cgit v1.2.3


From b4d693fcc5fe99ed211addb5c6a0f8398f0b266e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 16 Aug 2010 16:58:21 +0800
Subject: ocfs2: Cache system inodes of other slots.

Durring orphan scan, if we are slot 0, and we are replaying
orphan_dir:0001, the general process is that for every file
in this dir:
1. we will iget orphan_dir:0001, since there is no inode for it.
   we will have to create an inode and read it from the disk.
2. do the normal work, such as delete_inode and remove it from
   the dir if it is allowed.
3. call iput orphan_dir:0001 when we are done. In this case,
   since we have no dcache for this inode, i_count will
   reach 0, and VFS will have to call clear_inode and in
   ocfs2_clear_inode we will checkpoint the inode which will let
   ocfs2_cmt and journald begin to work.
4. We loop back to 1 for the next file.

So you see, actually for every deleted file, we have to read the
orphan dir from the disk and checkpoint the journal. It is very
time consuming and cause a lot of journal checkpoint I/O.
A better solution is that we can have another reference for these
inodes in ocfs2_super. So if there is no other race among
nodes(which will let dlmglue to checkpoint the inode), for step 3,
clear_inode won't be called and for step 1, we may only need to
read the inode for the 1st time. This is a big win for us.

So this patch will try to cache system inodes of other slots so
that we will have one more reference for these inodes and avoid
the extra inode read and journal checkpoint.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/ocfs2.h    |  3 ++-
 fs/ocfs2/ocfs2_fs.h |  5 +++++
 fs/ocfs2/super.c    | 20 +++++++++++++++---
 fs/ocfs2/sysfile.c  | 60 +++++++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 73 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 65739b3b3276..687e291d73f2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -284,7 +284,8 @@ struct ocfs2_super
 	struct super_block *sb;
 	struct inode *root_inode;
 	struct inode *sys_root_inode;
-	struct inode *system_inodes[NUM_SYSTEM_INODES];
+	struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+	struct inode **local_system_inodes;
 
 	struct ocfs2_slot_info *slot_info;
 
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..723b20dac414 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -309,6 +309,7 @@ enum {
 	USER_QUOTA_SYSTEM_INODE,
 	GROUP_QUOTA_SYSTEM_INODE,
 #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
 	ORPHAN_DIR_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +318,12 @@ enum {
 	TRUNCATE_LOG_SYSTEM_INODE,
 	LOCAL_USER_QUOTA_SYSTEM_INODE,
 	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
 	NUM_SYSTEM_INODES
 };
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES	\
+		(NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
 
 static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	/* Global system inodes (single copy) */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 47415398d56a..350e8b5a9396 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -514,11 +514,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	for (i = 0; i < NUM_SYSTEM_INODES; i++) {
-		inode = osb->system_inodes[i];
+	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+		inode = osb->global_system_inodes[i];
 		if (inode) {
 			iput(inode);
-			osb->system_inodes[i] = NULL;
+			osb->global_system_inodes[i] = NULL;
 		}
 	}
 
@@ -534,6 +534,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 		osb->root_inode = NULL;
 	}
 
+	if (!osb->local_system_inodes)
+		goto out;
+
+	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+		if (osb->local_system_inodes[i]) {
+			iput(osb->local_system_inodes[i]);
+			osb->local_system_inodes[i] = NULL;
+		}
+	}
+
+	kfree(osb->local_system_inodes);
+	osb->local_system_inodes = NULL;
+
+out:
 	mlog_exit(0);
 }
 
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 						   int type,
 						   u32 slot);
 
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-					   int type,
-					   u32 slot);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
 #endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
 		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
 }
 
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-					   int type,
-					   u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+					     int type,
+					     u32 slot)
 {
-	return slot == osb->slot_num || is_global_system_inode(type);
+	int index;
+	struct inode **local_system_inodes, **free = NULL;
+
+	BUG_ON(slot == OCFS2_INVALID_SLOT);
+	BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+	       type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+	spin_lock(&osb->osb_lock);
+	local_system_inodes = osb->local_system_inodes;
+	spin_unlock(&osb->osb_lock);
+
+	if (unlikely(!local_system_inodes)) {
+		local_system_inodes = kzalloc(sizeof(struct inode *) *
+					      NUM_LOCAL_SYSTEM_INODES *
+					      osb->max_slots,
+					      GFP_NOFS);
+		if (!local_system_inodes) {
+			mlog_errno(-ENOMEM);
+			/*
+			 * return NULL here so that ocfs2_get_sytem_file_inodes
+			 * will try to create an inode and use it. We will try
+			 * to initialize local_system_inodes next time.
+			 */
+			return NULL;
+		}
+
+		spin_lock(&osb->osb_lock);
+		if (osb->local_system_inodes) {
+			/* Someone has initialized it for us. */
+			free = local_system_inodes;
+			local_system_inodes = osb->local_system_inodes;
+		} else
+			osb->local_system_inodes = local_system_inodes;
+		spin_unlock(&osb->osb_lock);
+		if (unlikely(free))
+			kfree(free);
+	}
+
+	index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+		(type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+	return &local_system_inodes[index];
 }
 
 struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	struct inode **arr = NULL;
 
 	/* avoid the lookup if cached in local system file array */
-	if (is_in_system_inode_array(osb, type, slot))
-		arr = &(osb->system_inodes[type]);
+	if (is_global_system_inode(type)) {
+		arr = &(osb->global_system_inodes[type]);
+	} else
+		arr = get_local_system_inode(osb, type, slot);
 
 	if (arr && ((inode = *arr) != NULL)) {
 		/* get a ref in addition to the array ref */
-- 
cgit v1.2.3


From 5e98d492406818e6a94c0ba54c61f59d40cefa4a Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Mon, 28 Jun 2010 10:04:32 -0500
Subject: Track negative entries v3

Track negative dentries by recording the generation number of the parent
directory in d_fsdata. The generation number for the parent directory is
recorded in the inode_info, which increments every time the lock on the
directory is dropped.

If the generation number of the parent directory and the negative dentry
matches, there is no need to perform the revalidate, else a revalidate
is forced. This improves performance in situations where nodes look for
the same non-existent file multiple times.

Thanks Mark for explaining the DLM sequence.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dcache.c  | 33 +++++++++++++++++++++++++++++----
 fs/ocfs2/dcache.h  |  1 +
 fs/ocfs2/dlmglue.c |  8 ++++++++
 fs/ocfs2/inode.c   |  1 +
 fs/ocfs2/inode.h   |  1 +
 fs/ocfs2/namei.c   |  3 ++-
 6 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..edaded48e7e9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
 #include "inode.h"
 #include "super.h"
 
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+	unsigned long gen =
+		OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+	BUG_ON(dentry->d_inode);
+	dentry->d_fsdata = (void *)gen;
+}
+
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
 				   struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 
-	/* Never trust a negative dentry - force a new lookup. */
+	/* For a negative dentry -
+	 * check the generation number of the parent and compare with the
+	 * one stored in the inode.
+	 */
 	if (inode == NULL) {
-		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
-		     dentry->d_name.name);
-		goto bail;
+		unsigned long gen = (unsigned long) dentry->d_fsdata;
+		unsigned long pgen =
+			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+		mlog(0, "negative dentry: %.*s parent gen: %lu "
+			"dentry gen: %lu\n",
+			dentry->d_name.len, dentry->d_name.name, pgen, gen);
+		if (gen != pgen)
+			goto bail;
+		goto valid;
 	}
 
 	BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}
 
+valid:
 	ret = 1;
 
 bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	if (!inode)
 		return 0;
 
+	if (!dentry->d_inode && dentry->d_fsdata) {
+		/* Converting a negative dentry to positive
+		   Clear dentry->d_fsdata */
+		dentry->d_fsdata = dl = NULL;
+	}
+
 	if (dl) {
 		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
 				" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 out:
 	iput(inode);
+	ocfs2_dentry_attach_gen(dentry);
 }
 
 /*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
 		       struct inode *old_dir, struct inode *new_dir);
 
 extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
 
 #endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 {
 	struct inode *inode;
 	struct address_space *mapping;
+	struct ocfs2_inode_info *oi;
 
        	inode = ocfs2_lock_res_inode(lockres);
 	mapping = inode->i_mapping;
 
+	if (S_ISDIR(inode->i_mode)) {
+		oi = OCFS2_I(inode);
+		oi->ip_dir_lock_gen++;
+		mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+		goto out;
+	}
+
 	if (!S_ISREG(inode->i_mode))
 		goto out;
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..f935fd6600dd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		    else
 			    inode->i_fop = &ocfs2_dops_no_plocks;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    OCFS2_I(inode)->ip_dir_lock_gen = 1;
 		    break;
 	    case S_IFLNK:
 		    if (ocfs2_inode_is_fast_symlink(inode))
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0bc477a3aeb8..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -67,6 +67,7 @@ struct ocfs2_inode_info
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
+	u32				ip_dir_lock_gen;
 
 	struct ocfs2_alloc_reservation	ip_la_data_resv;
 };
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..e7bde21149ae 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
 			ret = ERR_PTR(status);
 			goto bail_unlock;
 		}
-	}
+	} else
+		ocfs2_dentry_attach_gen(dentry);
 
 bail_unlock:
 	/* Don't drop the cluster lock until *after* the d_add --
-- 
cgit v1.2.3


From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 Aug 2010 15:47:05 -0500
Subject: block, partition: add partition_meta_info to hd_struct

I'm reposting this patch series as v4 since there have been no additional
comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches
are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c
(2.6.36-rc3).

Would this patchset be suitable for inclusion in an mm branch?

This changes adds a partition_meta_info struct which itself contains a
union of structures that provide partition table specific metadata.

This change leaves the union empty. The subsequent patch includes an
implementation for CONFIG_EFI_PARTITION-based metadata.

Signed-off-by: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c         |  1 +
 block/ioctl.c         |  2 +-
 fs/partitions/check.c | 23 +++++++++++++++++++---
 fs/partitions/check.h |  3 +++
 include/linux/genhd.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..c8da12055264 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1004,6 +1004,7 @@ static void disk_release(struct device *dev)
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
+	free_part_info(&disk->part0);
 	kfree(disk);
 }
 struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..2c15fe0912c4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 
 			/* all seems OK */
 			part = add_partition(disk, partno, start, length,
-					     ADDPART_FLAG_NONE);
+					     ADDPART_FLAG_NONE, NULL);
 			mutex_unlock(&bdev->bd_mutex);
 			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..6dfbee03ccc6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	free_part_stats(p);
+	free_part_info(p);
 	kfree(p);
 }
 
@@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
 struct hd_struct *add_partition(struct gendisk *disk, int partno,
-				sector_t start, sector_t len, int flags)
+				sector_t start, sector_t len, int flags,
+				struct partition_meta_info *info)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
 
+	if (info) {
+		struct partition_meta_info *pinfo = alloc_part_info(disk);
+		if (!pinfo)
+			goto out_free_stats;
+		memcpy(pinfo, info, sizeof(*info));
+		p->info = pinfo;
+	}
+
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
 		dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free_stats;
+		goto out_free_info;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	return p;
 
+out_free_info:
+	free_part_info(p);
 out_free_stats:
 	free_part_stats(p);
 out_free:
@@ -642,6 +654,7 @@ rescan:
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
+		struct partition_meta_info *info = NULL;
 
 		size = state->parts[p].size;
 		if (!size)
@@ -675,8 +688,12 @@ rescan:
 				size = get_capacity(disk) - from;
 			}
 		}
+
+		if (state->parts[p].has_info)
+			info = &state->parts[p].info;
 		part = add_partition(disk, p, from, size,
-				     state->parts[p].flags);
+				     state->parts[p].flags,
+				     &state->parts[p].info);
 		if (IS_ERR(part)) {
 			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
 			       disk->disk_name, p, -PTR_ERR(part));
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/genhd.h>
 
 /*
  * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
 		sector_t from;
 		sector_t size;
 		int flags;
+		bool has_info;
+		struct partition_meta_info info;
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4d8fb0..66e26b5a1537 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/kdev_t.h>
 #include <linux/rcupdate.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -86,7 +87,15 @@ struct disk_stats {
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
 };
-	
+
+#define PARTITION_META_INFO_VOLNAMELTH	64
+#define PARTITION_META_INFO_UUIDLTH	16
+
+struct partition_meta_info {
+	u8 uuid[PARTITION_META_INFO_UUIDLTH];	/* always big endian */
+	u8 volname[PARTITION_META_INFO_VOLNAMELTH];
+};
+
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
@@ -95,6 +104,7 @@ struct hd_struct {
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
+	struct partition_meta_info *info;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	int make_it_fail;
 #endif
@@ -181,6 +191,30 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 	return NULL;
 }
 
+static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
+{
+	int i;
+	for (i = 0; i < 16; ++i) {
+		*to++ = (hex_to_bin(*uuid_str) << 4) |
+			(hex_to_bin(*(uuid_str + 1)));
+		uuid_str += 2;
+		switch (i) {
+		case 3:
+		case 5:
+		case 7:
+		case 9:
+			uuid_str++;
+			continue;
+		}
+	}
+}
+
+static inline char *part_unpack_uuid(const u8 *uuid, char *out)
+{
+	sprintf(out, "%pU", uuid);
+	return out;
+}
+
 static inline int disk_max_parts(struct gendisk *disk)
 {
 	if (disk->flags & GENHD_FL_EXT_DEVT)
@@ -342,6 +376,19 @@ static inline int part_in_flight(struct hd_struct *part)
 	return part->in_flight[0] + part->in_flight[1];
 }
 
+static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
+{
+	if (disk)
+		return kzalloc_node(sizeof(struct partition_meta_info),
+				    GFP_KERNEL, disk->node_id);
+	return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
+}
+
+static inline void free_part_info(struct hd_struct *part)
+{
+	kfree(part->info);
+}
+
 /* block/blk-core.c */
 extern void part_round_stats(int cpu, struct hd_struct *part);
 
@@ -533,7 +580,9 @@ extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
 						     int partno, sector_t start,
-						     sector_t len, int flags);
+						     sector_t len, int flags,
+						     struct partition_meta_info
+						       *info);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v1.2.3


From eec7ecfede74bb996060efefd5c157acd5794e8a Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 Aug 2010 15:47:06 -0500
Subject: genhd, efi: add efi partition metadata to hd_structs

This change extends the partition_meta_info structure to
support EFI GPT-specific metadata and ensures that data
is copied in on partition scanning.

Signed-off-by: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/partitions/efi.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
  *
  ************************************************************/
 #include <linux/crc32.h>
+#include <linux/ctype.h>
 #include <linux/math64.h>
 #include <linux/slab.h>
 #include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
 	gpt_entry *ptes = NULL;
 	u32 i;
 	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+	u8 unparsed_guid[37];
 
 	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
 	pr_debug("GUID Partition Table is valid!  Yea!\n");
 
 	for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+		struct partition_meta_info *info;
+		unsigned label_count = 0;
+		unsigned label_max;
 		u64 start = le64_to_cpu(ptes[i].starting_lba);
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
 			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
+
+		info = &state->parts[i + 1].info;
+		/* Instead of doing a manual swap to big endian, reuse the
+		 * common ASCII hex format as the interim.
+		 */
+		efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
+		part_pack_uuid(unparsed_guid, info->uuid);
+
+		/* Naively convert UTF16-LE to 7 bits. */
+		label_max = min(sizeof(info->volname) - 1,
+				sizeof(ptes[i].partition_name));
+		info->volname[label_max] = 0;
+		while (label_count < label_max) {
+			u8 c = ptes[i].partition_name[label_count] & 0xff;
+			if (c && !isprint(c))
+				c = '!';
+			info->volname[label_count] = c;
+			label_count++;
+		}
+		state->parts[i + 1].has_info = true;
 	}
 	kfree(ptes);
 	kfree(gpt);
-- 
cgit v1.2.3


From c0e1a3e80dc030cd54f06416efbf9e439bf6ecb7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 15 Sep 2010 16:56:54 -0700
Subject: ocfs2: Silence unused warning.

When CONFIG_OCFS2_DEBUG_MASKLOG is undefined, we don't use the dentry
variable in ocfs2_sync_file().  Let's just move all access to the dentry
inside the logging call.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a74542e1a05..13af9937bdda 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -174,12 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 {
 	int err = 0;
 	journal_t *journal;
-	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+		   file->f_path.dentry, file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
-- 
cgit v1.2.3


From 93f3b86fb1bd0ad7b4a5eb1ad1fdae2b290633b7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 15 Sep 2010 17:00:58 -0700
Subject: ocfs2: Initialize the bktcnt variable properly, and call it
 bucket_count

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..26ff2e185b1e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	struct hlist_head *bucket;
 	struct hlist_node *list;
 	int i, out = 0;
-	unsigned long total = 0, longest = 0, bktcnt;
+	unsigned long total = 0, longest = 0, bucket_count = 0;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 			mle = hlist_entry(list, struct dlm_master_list_entry,
 					  master_hash_node);
 			++total;
-			++bktcnt;
+			++bucket_count;
 			if (db->len - out < 200)
 				continue;
 			out += dump_mle(mle, db->buf + out, db->len - out);
 		}
-		longest = max(longest, bktcnt);
-		bktcnt = 0;
+		longest = max(longest, bucket_count);
+		bucket_count = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
-- 
cgit v1.2.3


From dd3932eddf428571762596e17b65f5dc92ca361b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 16 Sep 2010 20:51:46 +0200
Subject: block: remove BLKDEV_IFL_WAIT

All the blkdev_issue_* helpers can only sanely be used for synchronous
caller.  To issue cache flushes or barriers asynchronously the caller needs
to set up a bio by itself with a completion callback to move the asynchronous
state machine ahead.  So drop the BLKDEV_IFL_WAIT flag that is always
specified when calling blkdev_issue_* and also remove the now unused flags
argument to blkdev_issue_flush and blkdev_issue_zeroout.  For
blkdev_issue_discard we need to keep it for the secure discard flag, which
gains a more descriptive name and loses the bitops vs flag confusion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c                  | 25 +++++++++++--------------
 block/blk-lib.c                    | 21 ++++++++-------------
 block/ioctl.c                      |  4 ++--
 drivers/block/drbd/drbd_int.h      |  3 +--
 drivers/block/drbd/drbd_receiver.c |  2 +-
 fs/block_dev.c                     |  2 +-
 fs/btrfs/extent-tree.c             |  3 +--
 fs/ext3/fsync.c                    |  3 +--
 fs/ext4/fsync.c                    |  5 ++---
 fs/ext4/mballoc.c                  |  3 +--
 fs/fat/fatent.c                    |  3 +--
 fs/gfs2/rgrp.c                     |  5 ++---
 fs/jbd2/checkpoint.c               |  3 +--
 fs/jbd2/commit.c                   |  6 ++----
 fs/nilfs2/the_nilfs.c              |  4 ++--
 fs/reiserfs/file.c                 |  3 +--
 fs/xfs/linux-2.6/xfs_super.c       |  3 +--
 include/linux/blkdev.h             | 14 +++++---------
 mm/swapfile.c                      |  6 +++---
 19 files changed, 47 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 62b7df9bca9d..54b123d6563e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -205,7 +205,6 @@ static void bio_end_flush(struct bio *bio, int err)
  * @bdev:	blockdev to issue flush for
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
@@ -214,7 +213,7 @@ static void bio_end_flush(struct bio *bio, int err)
  *    request was pushed in some internal queue for later handling.
  */
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
-		sector_t *error_sector, unsigned long flags)
+		sector_t *error_sector)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
@@ -240,21 +239,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_flush;
 	bio->bi_bdev = bdev;
-	if (test_bit(BLKDEV_WAIT, &flags))
-		bio->bi_private = &wait;
+	bio->bi_private = &wait;
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	if (test_bit(BLKDEV_WAIT, &flags)) {
-		wait_for_completion(&wait);
-		/*
-		 * The driver must store the error location in ->bi_sector, if
-		 * it supports it. For non-stacked drivers, this should be
-		 * copied from blk_rq_pos(rq).
-		 */
-		if (error_sector)
-			*error_sector = bio->bi_sector;
-	}
+	wait_for_completion(&wait);
+
+	/*
+	 * The driver must store the error location in ->bi_sector, if
+	 * it supports it. For non-stacked drivers, this should be
+	 * copied from blk_rq_pos(rq).
+	 */
+	if (error_sector)
+               *error_sector = bio->bi_sector;
 
 	if (!bio_flagged(bio, BIO_UPTODATE))
 		ret = -EIO;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index fe2e6ed0f510..1a320d2406b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -61,7 +61,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
-	if (flags & BLKDEV_IFL_SECURE) {
+	if (flags & BLKDEV_DISCARD_SECURE) {
 		if (!blk_queue_secdiscard(q))
 			return -EOPNOTSUPP;
 		type |= REQ_SECURE;
@@ -77,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &wait;
+		bio->bi_private = &wait;
 
 		if (nr_sects > max_discard_sectors) {
 			bio->bi_size = max_discard_sectors << 9;
@@ -92,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_get(bio);
 		submit_bio(type, bio);
 
-		if (flags & BLKDEV_IFL_WAIT)
-			wait_for_completion(&wait);
+		wait_for_completion(&wait);
 
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
@@ -139,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *  Generate and issue number of bios with zerofiled pages.
@@ -148,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+			sector_t nr_sects, gfp_t gfp_mask)
 {
 	int ret;
 	struct bio *bio;
@@ -174,8 +171,7 @@ submit:
 		bio->bi_sector = sector;
 		bio->bi_bdev   = bdev;
 		bio->bi_end_io = bio_batch_end_io;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &bb;
+		bio->bi_private = &bb;
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -193,10 +189,9 @@ submit:
 		submit_bio(WRITE, bio);
 	}
 
-	if (flags & BLKDEV_IFL_WAIT)
-		/* Wait for bios in-flight */
-		while ( issued != atomic_read(&bb.done))
-			wait_for_completion(&wait);
+	/* Wait for bios in-flight */
+	while (issued != atomic_read(&bb.done))
+		wait_for_completion(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..cb2b9099862b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
 {
-	unsigned long flags = BLKDEV_IFL_WAIT;
+	unsigned long flags = 0;
 
 	if (start & 511)
 		return -EINVAL;
@@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
 	if (secure)
-		flags |= BLKDEV_IFL_SECURE;
+		flags |= BLKDEV_DISCARD_SECURE;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
 }
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..c2ef476f5711 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2321,8 +2321,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 		return;
 
-	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
 	if (r) {
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..df15e7f0e7b7 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -975,7 +975,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		if (rv) {
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..b737451e2e9d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -370,7 +370,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 43dc9ea9aef6..0b81ecdb101c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
 	 * storage
 	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..3f3ff5ee8f9d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync)
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a22bfef3da95..19aa0d44d822 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,8 +2566,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
-			       BLKDEV_IFL_WAIT);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f9a0b7ae8648..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -578,8 +578,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
 					nr_clus * sbi->sec_per_clus,
-					GFP_NOFS,
-					BLKDEV_IFL_WAIT);
+					GFP_NOFS, 0);
 
 				first_cl = cluster;
 			}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 379316472918..38b3ea1abacc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT);
+							    0);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -868,8 +868,7 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
 	}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6571a056e55d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -532,8 +532,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f204e27f44d1..cb43c605cfaa 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -684,8 +684,7 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -810,8 +809,7 @@ wait_for_iobuf:
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				   BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 	}
 
 	if (err)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 400b2caef4d8..d97310f07bef 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -774,7 +774,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS, BLKDEV_IFL_WAIT);
+						   GFP_NOFS, 0);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -784,7 +784,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, BLKDEV_IFL_WAIT);
+					   GFP_NOFS, 0);
 	return ret;
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff14..5fa7a30cc3f0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -693,8 +693,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
 }
 
 STATIC void
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cfcb3a610605..accbd0e5c893 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -867,18 +867,14 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 		return NULL;
 	return bqt->tag_index[tag];
 }
-enum{
-	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_SECURE,	/* secure discard */
-};
-#define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
-#define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
-			unsigned long);
+
+#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+			sector_t nr_sects, gfp_t gfp_mask);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 68cda164dff6..e132e1708acc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -141,7 +141,7 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			return err;
 		cond_resched();
@@ -152,7 +152,7 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			break;
 
@@ -191,7 +191,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
+				    nr_blocks, GFP_NOIO, 0))
 				break;
 		}
 
-- 
cgit v1.2.3


From 2680d722bf2c5f75225dd9acb3ec9e5a9e2652f4 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 17 Sep 2010 16:44:28 +0300
Subject: UBIFS: introduce new flag for RO due to errors

The R/O state may have various reasons:

1. The UBI volume is R/O
2. The FS is mounted R/O
3. The FS switched to R/O mode because of an error

However, in UBIFS we have only one variable which represents cases
1 and 3 - 'c->ro_media'. Indeed, we set this to 1 if we switch to
R/O mode due to an error, and then we test it in many places to
make sure that we stop writing as soon as the error happens.

But this is very unclean. One consequence of this, for example, is
that in 'ubifs_remount_fs()' we use 'c->ro_media' to check whether
we are in R/O mode because on an error, and we print a message
in this case. However, if we are in R/O mode because the media
is R/O, our message is bogus.

This patch introduces new flag - 'c->ro_error' which is set when
we switch to R/O mode because of an error. It also changes all
"if (c->ro_media)" checks to "if (c->ro_error)" checks, because
this is what the checks actually mean. We do not need to check
for 'c->ro_media' because if the UBI volume is in R/O mode, we
do not allow R/W mounting, and now writes can happen. This is
guaranteed by VFS. But it is good to double-check this, so this
patch also adds many "ubifs_assert(!c->ro_media)" checks.

In the 'ubifs_remount_fs()' function this patch makes a bit more
changes - it fixes the error messages as well.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/commit.c   |  4 +++-
 fs/ubifs/file.c     |  6 ++++--
 fs/ubifs/gc.c       |  3 ++-
 fs/ubifs/io.c       | 16 ++++++++++------
 fs/ubifs/journal.c  |  3 ++-
 fs/ubifs/log.c      |  4 ++--
 fs/ubifs/master.c   |  3 ++-
 fs/ubifs/misc.h     |  9 ++++++---
 fs/ubifs/shrinker.c |  2 +-
 fs/ubifs/super.c    | 14 +++++++++-----
 fs/ubifs/ubifs.h    |  4 +++-
 11 files changed, 44 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..712432789fb8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
 	struct ubifs_lp_stats lst;
 
 	dbg_cmt("start");
-	if (c->ro_media) {
+	ubifs_assert(!c->ro_media);
+
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_up;
 	}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..c6bc51c9f07c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+	ubifs_assert(!c->ro_media);
 
-	if (unlikely(c->ro_media))
+	if (unlikely(c->ro_error))
 		return -EROFS;
 
 	/* Try out the fast-path part first */
@@ -1440,8 +1441,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
 	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, page->index,
 		i_size_read(inode));
 	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
+	ubifs_assert(!c->ro_media);
 
-	if (unlikely(c->ro_media))
+	if (unlikely(c->ro_error))
 		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 396f24a30af9..d927196d730b 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -616,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	ubifs_assert_cmt_locked(c);
+	ubifs_assert(!c->ro_media);
 
 	if (ubifs_gc_should_commit(c))
 		return -EAGAIN;
 
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		ret = -EROFS;
 		goto out_unlock;
 	}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 9432431bf595..18a4b8d7c844 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
  */
 void ubifs_ro_mode(struct ubifs_info *c, int err)
 {
-	if (!c->ro_media) {
-		c->ro_media = 1;
+	if (!c->ro_error) {
+		c->ro_error = 1;
 		c->no_chk_data_crc = 0;
 		c->vfs_sb->s_flags |= MS_RDONLY;
 		ubifs_warn("switched to read-only mode, error %d", err);
@@ -359,8 +359,9 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+	ubifs_assert(!c->ro_media);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +441,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
 {
 	int err, i;
 
+	ubifs_assert(!c->ro_media);
 	if (!c->need_wbuf_sync)
 		return 0;
 	c->need_wbuf_sync = 0;
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_timers;
 	}
@@ -519,6 +521,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
 	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+	ubifs_assert(!c->ro_media);
 
 	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
 		err = -ENOSPC;
@@ -527,7 +530,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 
 	cancel_wbuf_timer_nolock(wbuf);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	if (aligned_len <= wbuf->avail) {
@@ -663,8 +666,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
 	       buf_len);
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
 	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+	ubifs_assert(!c->ro_media);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	ubifs_prepare_node(c, buf, len, 1);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..a6da8aa68f37 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
 	 * better to try to allocate space at the ends of eraseblocks. This is
 	 * what the squeeze parameter does.
 	 */
+	ubifs_assert(!c->ro_media);
 	squeeze = (jhead == BASEHD);
 again:
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_unlock;
 	}
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..a41713e2fbb3 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	mutex_lock(&c->log_mutex);
-
-	if (c->ro_media) {
+	ubifs_assert(!c->ro_media);
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_unlock;
 	}
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..0c818e855f59 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
 {
 	int err, lnum, offs, len;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media);
+	if (c->ro_error)
 		return -EROFS;
 
 	lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..5d476ba184e6 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_unmap(c->ubi, lnum);
 	if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
 	if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
 	if (err) {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..10eec8778438 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
 			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
 
 			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
-			    c->ro_media) {
+			    c->ro_media || c->ro_error) {
 				mutex_unlock(&c->umount_mutex);
 				continue;
 			}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..1cfeec56df91 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1751,10 +1751,10 @@ static void ubifs_put_super(struct super_block *sb)
 				ubifs_wbuf_sync(&c->jheads[i].wbuf);
 
 		/*
-		 * On fatal errors c->ro_media is set to 1, in which case we do
+		 * On fatal errors c->ro_error is set to 1, in which case we do
 		 * not write the master node.
 		 */
-		if (!c->ro_media) {
+		if (!c->ro_error) {
 			/*
 			 * We are being cleanly unmounted which means the
 			 * orphans were killed - indicate this in the master
@@ -1798,16 +1798,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		if (c->ro_error) {
+			ubifs_msg("cannot re-mount R/W due to prior errors");
+			return -EROFS;
+		}
 		if (c->ro_media) {
-			ubifs_msg("cannot re-mount due to prior errors");
+			ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
 			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
-		if (c->ro_media) {
-			ubifs_msg("cannot re-mount due to prior errors");
+		if (c->ro_error) {
+			ubifs_msg("cannot re-mount R/O due to prior errors");
 			return -EROFS;
 		}
 		ubifs_remount_ro(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4dc9b18f73e..f47ebb442d1c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1032,6 +1032,7 @@ struct ubifs_debug_info;
  * @max_leb_cnt: maximum count of logical eraseblocks
  * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
+ * @ro_error: UBIFS switched to R/O mode because an error happened
  *
  * @dirty_pg_cnt: number of dirty pages (not used)
  * @dirty_zn_cnt: number of dirty znodes
@@ -1272,7 +1273,8 @@ struct ubifs_info {
 	int leb_cnt;
 	int max_leb_cnt;
 	int old_leb_cnt;
-	int ro_media;
+	unsigned int ro_media:1;
+	unsigned int ro_error:1;
 
 	atomic_long_t dirty_pg_cnt;
 	atomic_long_t dirty_zn_cnt;
-- 
cgit v1.2.3


From ed58b2917be24fc8603128e32d50a1378afe66e1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Sep 2010 10:54:37 -0400
Subject: NFS: Remove \t from mount debugging message

During boot, a random character is displayed instead of a tab.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..d610203d95c6 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -436,7 +436,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 
 	for (i = 0; i < entries; i++) {
 		flavors[i] = ntohl(*p++);
-		dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
+		dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
 	}
 	*count = i;
 
-- 
cgit v1.2.3


From 60ac03685bf513f9d9b6e8e098018b35309ed326 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Sep 2010 10:54:37 -0400
Subject: NFS: Clean up NFSROOT command line parsing

Clean up: To reduce confusion, rename nfs_root_name as nfs_root_parms,
as this buffer contains more than just the name of the remote server.

Introduce documenting comments around nfs_root_setup().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..169b67907a65 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -99,7 +99,7 @@
 #define NFS_ROOT		"/tftpboot/%s"
 
 /* Parameters passed from the kernel command line */
-static char nfs_root_name[256] __initdata = "";
+static char nfs_root_parms[256] __initdata = "";
 
 /* Address of NFS server */
 static __be32 servaddr __initdata = 0;
@@ -369,7 +369,7 @@ static int __init root_nfs_init(void)
 	 * be able to use the client IP address for the remote root
 	 * directory (necessary for pure RARP booting).
 	 */
-	if (root_nfs_name(nfs_root_name) < 0 ||
+	if (root_nfs_name(nfs_root_parms) < 0 ||
 	    root_nfs_addr() < 0)
 		return -1;
 
@@ -380,23 +380,37 @@ static int __init root_nfs_init(void)
 	return 0;
 }
 
-
 /*
  *  Parse NFS server and directory information passed on the kernel
  *  command line.
+ *
+ *  nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ *  If there is a "%s" token in the <root-dir> string, it is replaced
+ *  by the ASCII-representation of the client's IP address.
  */
 static int __init nfs_root_setup(char *line)
 {
 	ROOT_DEV = Root_NFS;
+
 	if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
-		strlcpy(nfs_root_name, line, sizeof(nfs_root_name));
+		strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
 	} else {
-		int n = strlen(line) + sizeof(NFS_ROOT) - 1;
-		if (n >= sizeof(nfs_root_name))
-			line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0';
-		sprintf(nfs_root_name, NFS_ROOT, line);
+		size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
+		if (n >= sizeof(nfs_root_parms))
+			line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
+		sprintf(nfs_root_parms, NFS_ROOT, line);
 	}
-	root_server_addr = root_nfs_parse_addr(nfs_root_name);
+
+	/*
+	 * Extract the IP address of the NFS server containing our
+	 * root file system, if one was specified.
+	 *
+	 * Note: root_nfs_parse_addr() removes the server-ip from
+	 *	 nfs_root_parms, if it exists.
+	 */
+	root_server_addr = root_nfs_parse_addr(nfs_root_parms);
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 56463e50d1fc3f070492434cea6303b35ea000de Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Sep 2010 10:54:37 -0400
Subject: NFS: Use super.c for NFSROOT mount option parsing

Replace duplicate code in NFSROOT for mounting an NFS server on '/'
with logic that uses the existing mainline text-based logic in the NFS
client.

Add documenting comments where appropriate.

Note that this means NFSROOT mounts now use the same default settings
as v2/v3 mounts done via mount(2) from user space.

  vers=3,tcp,rsize=<negotiated default>,wsize=<negotiated default>

As before, however, no version/protocol negotiation with the server is
done.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c       | 182 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nfs_fs.h |  10 +--
 init/do_mounts.c       |  12 ++--
 3 files changed, 187 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 169b67907a65..cb4a6bdca871 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
  *
  *  Allow an NFS filesystem to be mounted as root. The way this works is:
  *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
- *     (2) Handle RPC negotiation with the system which replied to RARP or
- *         was reported as a boot server by BOOTP or manually.
- *     (3) The actual mounting is done later, when init() is running.
+ *     (2) Construct the device string and the options string using DHCP
+ *         option 17 and/or kernel command line options.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *         to the NFS client's regular mount interface via sys_mount().
  *
  *
  *	Changes:
@@ -65,7 +66,8 @@
  *	Hua Qin		:	Support for mounting root file system via
  *				NFS over TCP.
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
-*/
+ *	Chuck Lever	:	Use super.c's text-based mount option parsing
+ */
 
 #include <linux/types.h>
 #include <linux/string.h>
@@ -101,11 +103,17 @@
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
 
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = "";
+
 /* Address of NFS server */
 static __be32 servaddr __initdata = 0;
 
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
 
 /* NFS-related data */
 static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -537,7 +545,7 @@ out:
  *  Get the NFS port numbers and file handle, and return the prepared 'data'
  *  argument for mount() if everything went OK. Return NULL otherwise.
  */
-void * __init nfs_root_data(void)
+void * __init old_nfs_root_data(void)
 {
 	if (root_nfs_init() < 0
 	 || root_nfs_ports() < 0
@@ -546,3 +554,165 @@ void * __init nfs_root_data(void)
 	set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
 	return (void*)&nfs_data;
 }
+
+static int __init root_nfs_copy(char *dest, const char *src,
+				     const size_t destlen)
+{
+	if (strlcpy(dest, src, destlen) > destlen)
+		return -1;
+	return 0;
+}
+
+static int __init root_nfs_cat(char *dest, const char *src,
+				  const size_t destlen)
+{
+	if (strlcat(dest, src, destlen) > destlen)
+		return -1;
+	return 0;
+}
+
+/*
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
+ */
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
+					 const size_t exppathlen)
+{
+	char *p;
+
+	/*
+	 * Set the NFS remote path
+	 */
+	p = strsep(&incoming, ",");
+	if (*p != '\0' && strcmp(p, "default") != 0)
+		if (root_nfs_copy(exppath, p, exppathlen))
+			return -1;
+
+	/*
+	 * @incoming now points to the rest of the string; if it
+	 * contains something, append it to our root options buffer
+	 */
+	if (incoming != NULL && *incoming != '\0')
+		if (root_nfs_cat(nfs_root_options, incoming,
+						sizeof(nfs_root_options)))
+			return -1;
+
+	/*
+	 * Possibly prepare for more options to be appended
+	 */
+	if (nfs_root_options[0] != '\0' &&
+	    nfs_root_options[strlen(nfs_root_options)] != ',')
+		if (root_nfs_cat(nfs_root_options, ",",
+						sizeof(nfs_root_options)))
+			return -1;
+
+	return 0;
+}
+
+/*
+ *  Decode the export directory path name and NFS options from
+ *  the kernel command line.  This has to be done late in order to
+ *  use a dynamically acquired client IP address for the remote
+ *  root directory path.
+ *
+ *  Returns zero if successful; otherwise -1 is returned.
+ */
+static int __init root_nfs_data(char *cmdline)
+{
+	char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+	int len, retval = -1;
+	char *tmp = NULL;
+	const size_t tmplen = sizeof(nfs_export_path);
+
+	tmp = kzalloc(tmplen, GFP_KERNEL);
+	if (tmp == NULL)
+		goto out_nomem;
+	strcpy(tmp, NFS_ROOT);
+
+	if (root_server_path[0] != '\0') {
+		dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+			root_server_path);
+		if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+			goto out_optionstoolong;
+	}
+
+	if (cmdline[0] != '\0') {
+		dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+		if (root_nfs_parse_options(cmdline, tmp, tmplen))
+			goto out_optionstoolong;
+	}
+
+	/*
+	 * Append mandatory options for nfsroot so they override
+	 * what has come before
+	 */
+	snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+			&servaddr);
+	if (root_nfs_cat(nfs_root_options, addr_option,
+						sizeof(nfs_root_options)))
+		goto out_optionstoolong;
+
+	/*
+	 * Set up nfs_root_device.  For NFS mounts, this looks like
+	 *
+	 *	server:/path
+	 *
+	 * At this point, utsname()->nodename contains our local
+	 * IP address or hostname, set by ipconfig.  If "%s" exists
+	 * in tmp, substitute the nodename, then shovel the whole
+	 * mess into nfs_root_device.
+	 */
+	len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+				tmp, utsname()->nodename);
+	if (len > (int)sizeof(nfs_export_path))
+		goto out_devnametoolong;
+	len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+				"%pI4:%s", &servaddr, nfs_export_path);
+	if (len > (int)sizeof(nfs_root_device))
+		goto out_devnametoolong;
+
+	retval = 0;
+
+out:
+	kfree(tmp);
+	return retval;
+out_nomem:
+	printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+	goto out;
+out_optionstoolong:
+	printk(KERN_ERR "Root-NFS: mount options string too long\n");
+	goto out;
+out_devnametoolong:
+	printk(KERN_ERR "Root-NFS: root device name too long.\n");
+	goto out;
+}
+
+/**
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
+ * @root_device: OUT: address of string containing NFSROOT device
+ * @root_data: OUT: address of string containing NFSROOT mount options
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
+ */
+int __init nfs_root_data(char **root_device, char **root_data)
+{
+#ifdef NFSROOT_DEBUG
+	nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
+#endif	/* NFSROOT_DEBUG */
+
+	servaddr = root_server_addr;
+	if (servaddr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-NFS: no NFS server address\n");
+		return -1;
+	}
+
+	if (root_nfs_data(nfs_root_parms) < 0)
+		return -1;
+
+	*root_device = nfs_root_device;
+	*root_data = nfs_root_options;
+	return 0;
+}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 508f8cf6da37..2a18f1582fa4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -364,6 +364,7 @@ extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ct
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
 extern void nfs_fattr_init(struct nfs_fattr *fattr);
+extern unsigned long nfs_inc_attr_generation_counter(void);
 
 extern struct nfs_fattr *nfs_alloc_fattr(void);
 
@@ -379,9 +380,12 @@ static inline void nfs_free_fhandle(const struct nfs_fh *fh)
 	kfree(fh);
 }
 
+/*
+ * linux/fs/nfs/nfsroot.c
+ */
+extern int  nfs_root_data(char **root_device, char **root_data); /*__init*/
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
-extern unsigned long nfs_inc_attr_generation_counter(void);
 
 /*
  * linux/fs/nfs/file.c
@@ -584,10 +588,6 @@ nfs_fileid_to_ino_t(u64 fileid)
 	return ino;
 }
 
-/* NFS root */
-
-extern void * nfs_root_data(void);
-
 #define nfs_wait_event(clnt, wq, condition)				\
 ({									\
 	int __retval = wait_event_killable(wq, condition);		\
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 02e3ca4fc527..52387f14f9b4 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -291,13 +291,13 @@ out:
 #ifdef CONFIG_ROOT_NFS
 static int __init mount_nfs_root(void)
 {
-	void *data = nfs_root_data();
+	char *root_dev, *root_data;
 
-	create_dev("/dev/root", ROOT_DEV);
-	if (data &&
-	    do_mount_root("/dev/root", "nfs", root_mountflags, data) == 0)
-		return 1;
-	return 0;
+	if (nfs_root_data(&root_dev, &root_data) != 0)
+		return 0;
+	if (do_mount_root(root_dev, "nfs", root_mountflags, root_data) != 0)
+		return 0;
+	return 1;
 }
 #endif
 
-- 
cgit v1.2.3


From 8d2321037896aa4868a67f45b2d6ed52b579a48a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Sep 2010 10:54:37 -0400
Subject: NFS: Clean up nfsroot.c

Clean up: now that mount option parsing for nfsroot is handled
in fs/nfs/super.c, remove code in fs/nfs/nfsroot.c that is no
longer used.  This includes code that constructs the legacy
nfs_mount_data structure, and code that does a MNT call to the
server.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 419 +------------------------------------------------------
 1 file changed, 1 insertion(+), 418 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cb4a6bdca871..8e7d623173a9 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -71,22 +71,12 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_mount.h>
-#include <linux/in.h>
-#include <linux/major.h>
 #include <linux/utsname.h>
-#include <linux/inet.h>
 #include <linux/root_dev.h>
 #include <net/ipconfig.h>
-#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -94,9 +84,6 @@
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
-/* Default port to use if server is not running a portmapper */
-#define NFS_MNT_PORT	627
-
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT		"/tftpboot/%s"
 
@@ -107,7 +94,7 @@ static char nfs_root_parms[256] __initdata = "";
 static char nfs_root_options[256] __initdata = "";
 
 /* Address of NFS server */
-static __be32 servaddr __initdata = 0;
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
 
 /* Name of directory to mount */
 static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
@@ -115,279 +102,6 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
 /* server:export path string passed to super.c */
 static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
 
-/* NFS-related data */
-static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
-static int nfs_port __initdata = 0;		/* Port to connect to for NFS */
-static int mount_port __initdata = 0;		/* Mount daemon port number */
-
-
-/***************************************************************************
-
-			     Parsing of options
-
- ***************************************************************************/
-
-enum {
-	/* Options that take integer arguments */
-	Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-	Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
-	/* Options that take no arguments */
-	Opt_soft, Opt_hard, Opt_intr,
-	Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
-	Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
-	Opt_acl, Opt_noacl,
-	/* Error token */
-	Opt_err
-};
-
-static const match_table_t tokens __initconst = {
-	{Opt_port, "port=%u"},
-	{Opt_rsize, "rsize=%u"},
-	{Opt_wsize, "wsize=%u"},
-	{Opt_timeo, "timeo=%u"},
-	{Opt_retrans, "retrans=%u"},
-	{Opt_acregmin, "acregmin=%u"},
-	{Opt_acregmax, "acregmax=%u"},
-	{Opt_acdirmin, "acdirmin=%u"},
-	{Opt_acdirmax, "acdirmax=%u"},
-	{Opt_soft, "soft"},
-	{Opt_hard, "hard"},
-	{Opt_intr, "intr"},
-	{Opt_nointr, "nointr"},
-	{Opt_posix, "posix"},
-	{Opt_noposix, "noposix"},
-	{Opt_cto, "cto"},
-	{Opt_nocto, "nocto"},
-	{Opt_ac, "ac"},
-	{Opt_noac, "noac"},
-	{Opt_lock, "lock"},
-	{Opt_nolock, "nolock"},
-	{Opt_v2, "nfsvers=2"},
-	{Opt_v2, "v2"},
-	{Opt_v3, "nfsvers=3"},
-	{Opt_v3, "v3"},
-	{Opt_udp, "proto=udp"},
-	{Opt_udp, "udp"},
-	{Opt_tcp, "proto=tcp"},
-	{Opt_tcp, "tcp"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_err, NULL}
-	
-};
-
-/*
- *  Parse option string.
- */
-
-static int __init root_nfs_parse(char *name, char *buf)
-{
-
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-
-	if (!name)
-		return 1;
-
-	/* Set the NFS remote path */
-	p = strsep(&name, ",");
-	if (p[0] != '\0' && strcmp(p, "default") != 0)
-		strlcpy(buf, p, NFS_MAXPATHLEN);
-
-	while ((p = strsep (&name, ",")) != NULL) {
-		int token; 
-		if (!*p)
-			continue;
-		token = match_token(p, tokens, args);
-
-		/* %u tokens only. Beware if you add new tokens! */
-		if (token < Opt_soft && match_int(&args[0], &option))
-			return 0;
-		switch (token) {
-			case Opt_port:
-				nfs_port = option;
-				break;
-			case Opt_rsize:
-				nfs_data.rsize = option;
-				break;
-			case Opt_wsize:
-				nfs_data.wsize = option;
-				break;
-			case Opt_timeo:
-				nfs_data.timeo = option;
-				break;
-			case Opt_retrans:
-				nfs_data.retrans = option;
-				break;
-			case Opt_acregmin:
-				nfs_data.acregmin = option;
-				break;
-			case Opt_acregmax:
-				nfs_data.acregmax = option;
-				break;
-			case Opt_acdirmin:
-				nfs_data.acdirmin = option;
-				break;
-			case Opt_acdirmax:
-				nfs_data.acdirmax = option;
-				break;
-			case Opt_soft:
-				nfs_data.flags |= NFS_MOUNT_SOFT;
-				break;
-			case Opt_hard:
-				nfs_data.flags &= ~NFS_MOUNT_SOFT;
-				break;
-			case Opt_intr:
-			case Opt_nointr:
-				break;
-			case Opt_posix:
-				nfs_data.flags |= NFS_MOUNT_POSIX;
-				break;
-			case Opt_noposix:
-				nfs_data.flags &= ~NFS_MOUNT_POSIX;
-				break;
-			case Opt_cto:
-				nfs_data.flags &= ~NFS_MOUNT_NOCTO;
-				break;
-			case Opt_nocto:
-				nfs_data.flags |= NFS_MOUNT_NOCTO;
-				break;
-			case Opt_ac:
-				nfs_data.flags &= ~NFS_MOUNT_NOAC;
-				break;
-			case Opt_noac:
-				nfs_data.flags |= NFS_MOUNT_NOAC;
-				break;
-			case Opt_lock:
-				nfs_data.flags &= ~NFS_MOUNT_NONLM;
-				break;
-			case Opt_nolock:
-				nfs_data.flags |= NFS_MOUNT_NONLM;
-				break;
-			case Opt_v2:
-				nfs_data.flags &= ~NFS_MOUNT_VER3;
-				break;
-			case Opt_v3:
-				nfs_data.flags |= NFS_MOUNT_VER3;
-				break;
-			case Opt_udp:
-				nfs_data.flags &= ~NFS_MOUNT_TCP;
-				break;
-			case Opt_tcp:
-				nfs_data.flags |= NFS_MOUNT_TCP;
-				break;
-			case Opt_acl:
-				nfs_data.flags &= ~NFS_MOUNT_NOACL;
-				break;
-			case Opt_noacl:
-				nfs_data.flags |= NFS_MOUNT_NOACL;
-				break;
-			default:
-				printk(KERN_WARNING "Root-NFS: unknown "
-					"option: %s\n", p);
-				return 0;
-		}
-	}
-
-	return 1;
-}
-
-/*
- *  Prepare the NFS data structure and parse all options.
- */
-static int __init root_nfs_name(char *name)
-{
-	static char buf[NFS_MAXPATHLEN] __initdata;
-	char *cp;
-
-	/* Set some default values */
-	memset(&nfs_data, 0, sizeof(nfs_data));
-	nfs_port          = -1;
-	nfs_data.version  = NFS_MOUNT_VERSION;
-	nfs_data.flags    = NFS_MOUNT_NONLM;	/* No lockd in nfs root yet */
-	nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.acregmin = NFS_DEF_ACREGMIN;
-	nfs_data.acregmax = NFS_DEF_ACREGMAX;
-	nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
-	nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
-	strcpy(buf, NFS_ROOT);
-
-	/* Process options received from the remote server */
-	root_nfs_parse(root_server_path, buf);
-
-	/* Override them by options set on kernel command-line */
-	root_nfs_parse(name, buf);
-
-	cp = utsname()->nodename;
-	if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
-		printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
-		return -1;
-	}
-	sprintf(nfs_export_path, buf, cp);
-
-	return 1;
-}
-
-
-/*
- *  Get NFS server address.
- */
-static int __init root_nfs_addr(void)
-{
-	if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) {
-		printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
-		return -1;
-	}
-
-	snprintf(nfs_data.hostname, sizeof(nfs_data.hostname),
-		 "%pI4", &servaddr);
-	return 0;
-}
-
-/*
- *  Tell the user what's going on.
- */
-#ifdef NFSROOT_DEBUG
-static void __init root_nfs_print(void)
-{
-	printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
-		nfs_export_path, nfs_data.hostname);
-	printk(KERN_NOTICE "Root-NFS:     rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
-		nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
-	printk(KERN_NOTICE "Root-NFS:     acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
-		nfs_data.acregmin, nfs_data.acregmax,
-		nfs_data.acdirmin, nfs_data.acdirmax);
-	printk(KERN_NOTICE "Root-NFS:     nfsd port = %d, mountd port = %d, flags = %08x\n",
-		nfs_port, mount_port, nfs_data.flags);
-}
-#endif
-
-
-static int __init root_nfs_init(void)
-{
-#ifdef NFSROOT_DEBUG
-	nfs_debug |= NFSDBG_ROOT;
-#endif
-
-	/*
-	 * Decode the root directory path name and NFS options from
-	 * the kernel command line. This has to go here in order to
-	 * be able to use the client IP address for the remote root
-	 * directory (necessary for pure RARP booting).
-	 */
-	if (root_nfs_name(nfs_root_parms) < 0 ||
-	    root_nfs_addr() < 0)
-		return -1;
-
-#ifdef NFSROOT_DEBUG
-	root_nfs_print();
-#endif
-
-	return 0;
-}
-
 /*
  *  Parse NFS server and directory information passed on the kernel
  *  command line.
@@ -424,137 +138,6 @@ static int __init nfs_root_setup(char *line)
 
 __setup("nfsroot=", nfs_root_setup);
 
-/***************************************************************************
-
-	       Routines to actually mount the root directory
-
- ***************************************************************************/
-
-/*
- *  Construct sockaddr_in from address and port number.
- */
-static inline void
-set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
-{
-	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = addr;
-	sin->sin_port = port;
-}
-
-/*
- *  Query server portmapper for the port of a daemon program.
- */
-static int __init root_nfs_getport(int program, int version, int proto)
-{
-	struct sockaddr_in sin;
-
-	printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n",
-		program, version, &servaddr);
-	set_sockaddr(&sin, servaddr, 0);
-	return rpcb_getport_sync(&sin, program, version, proto);
-}
-
-
-/*
- *  Use portmapper to find mountd and nfsd port numbers if not overriden
- *  by the user. Use defaults if portmapper is not available.
- *  XXX: Is there any nfs server with no portmapper?
- */
-static int __init root_nfs_ports(void)
-{
-	int port;
-	int nfsd_ver, mountd_ver;
-	int nfsd_port, mountd_port;
-	int proto;
-
-	if (nfs_data.flags & NFS_MOUNT_VER3) {
-		nfsd_ver = NFS3_VERSION;
-		mountd_ver = NFS_MNT3_VERSION;
-		nfsd_port = NFS_PORT;
-		mountd_port = NFS_MNT_PORT;
-	} else {
-		nfsd_ver = NFS2_VERSION;
-		mountd_ver = NFS_MNT_VERSION;
-		nfsd_port = NFS_PORT;
-		mountd_port = NFS_MNT_PORT;
-	}
-
-	proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-
-	if (nfs_port < 0) {
-		if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
-			printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
-					"number from server, using default\n");
-			port = nfsd_port;
-		}
-		nfs_port = port;
-		dprintk("Root-NFS: Portmapper on server returned %d "
-			"as nfsd port\n", port);
-	}
-
-	if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
-		printk(KERN_ERR "Root-NFS: Unable to get mountd port "
-				"number from server, using default\n");
-		port = mountd_port;
-	}
-	mount_port = port;
-	dprintk("Root-NFS: mountd port is %d\n", port);
-
-	return 0;
-}
-
-
-/*
- *  Get a file handle from the server for the directory which is to be
- *  mounted.
- */
-static int __init root_nfs_get_handle(void)
-{
-	struct sockaddr_in sin;
-	unsigned int auth_flav_len = 0;
-	struct nfs_mount_request request = {
-		.sap		= (struct sockaddr *)&sin,
-		.salen		= sizeof(sin),
-		.dirpath	= nfs_export_path,
-		.version	= (nfs_data.flags & NFS_MOUNT_VER3) ?
-					NFS_MNT3_VERSION : NFS_MNT_VERSION,
-		.protocol	= (nfs_data.flags & NFS_MOUNT_TCP) ?
-					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-		.auth_flav_len	= &auth_flav_len,
-	};
-	int status = -ENOMEM;
-
-	request.fh = nfs_alloc_fhandle();
-	if (!request.fh)
-		goto out;
-	set_sockaddr(&sin, servaddr, htons(mount_port));
-	status = nfs_mount(&request);
-	if (status < 0)
-		printk(KERN_ERR "Root-NFS: Server returned error %d "
-				"while mounting %s\n", status, nfs_export_path);
-	else {
-		nfs_data.root.size = request.fh->size;
-		memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
-	}
-	nfs_free_fhandle(request.fh);
-out:
-	return status;
-}
-
-/*
- *  Get the NFS port numbers and file handle, and return the prepared 'data'
- *  argument for mount() if everything went OK. Return NULL otherwise.
- */
-void * __init old_nfs_root_data(void)
-{
-	if (root_nfs_init() < 0
-	 || root_nfs_ports() < 0
-	 || root_nfs_get_handle() < 0)
-		return NULL;
-	set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
-	return (void*)&nfs_data;
-}
-
 static int __init root_nfs_copy(char *dest, const char *src,
 				     const size_t destlen)
 {
-- 
cgit v1.2.3


From 306a075362a288683f6346185f97dd0e06df19da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Sep 2010 10:54:37 -0400
Subject: NFS: Allow NFSROOT debugging messages to be enabled dynamically

As a convenience, introduce a kernel command line option to enable
NFSROOT debugging messages.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/nfs/nfsroot.txt | 22 ++++++++++++++++++++++
 Documentation/kernel-parameters.txt       |  5 ++++-
 fs/nfs/nfsroot.c                          | 19 +++++++++++++------
 3 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index f2430a7974e1..90c71c6f0d00 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -159,6 +159,28 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
                 Default: any
 
 
+nfsrootdebug
+
+  This parameter enables debugging messages to appear in the kernel
+  log at boot time so that administrators can verify that the correct
+  NFS mount options, server address, and root path are passed to the
+  NFS client.
+
+
+rdinit=<executable file>
+
+  To specify which file contains the program that starts system
+  initialization, administrators can use this command line parameter.
+  The default value of this parameter is "/init".  If the specified
+  file exists and the kernel can execute it, root filesystem related
+  kernel command line parameters, including `nfsroot=', are ignored.
+
+  A description of the process of mounting the root file system can be
+  found in:
+
+    Documentation/early-userspace/README
+
+
 
 
 3.) Boot Loader
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f084af0cb8e0..0fe70b2c4194 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1532,12 +1532,15 @@ and is between 256 and 4096 characters. It is defined in the file
 			1 to enable accounting
 			Default value is 0.
 
-	nfsaddrs=	[NFS]
+	nfsaddrs=	[NFS] Deprecated.  Use ip= instead.
 			See Documentation/filesystems/nfs/nfsroot.txt.
 
 	nfsroot=	[NFS] nfs root filesystem for disk-less boxes.
 			See Documentation/filesystems/nfs/nfsroot.txt.
 
+	nfsrootdebug	[NFS] enable nfsroot debugging messages.
+			See Documentation/filesystems/nfs/nfsroot.txt.
+
 	nfs.callback_tcpport=
 			[NFS] set the TCP port on which the NFSv4 callback
 			channel should listen.
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8e7d623173a9..460df3652889 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -67,6 +67,7 @@
  *				NFS over TCP.
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
  *	Chuck Lever	:	Use super.c's text-based mount option parsing
+ *	Chuck Lever	:	Add "nfsrootdebug".
  */
 
 #include <linux/types.h>
@@ -80,8 +81,6 @@
 
 #include "internal.h"
 
-/* Define this to allow debugging output */
-#undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
 /* Default path we try to mount. "%s" gets replaced by our IP address */
@@ -102,6 +101,18 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
 /* server:export path string passed to super.c */
 static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
 
+/*
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
+ */
+static int __init nfs_root_debug(char *__unused)
+{
+	nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
+	return 1;
+}
+
+__setup("nfsrootdebug", nfs_root_debug);
+
 /*
  *  Parse NFS server and directory information passed on the kernel
  *  command line.
@@ -282,10 +293,6 @@ out_devnametoolong:
  */
 int __init nfs_root_data(char **root_device, char **root_data)
 {
-#ifdef NFSROOT_DEBUG
-	nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
-#endif	/* NFSROOT_DEBUG */
-
 	servaddr = root_server_addr;
 	if (servaddr == htonl(INADDR_NONE)) {
 		printk(KERN_ERR "Root-NFS: no NFS server address\n");
-- 
cgit v1.2.3


From cd9a1c0e5ac681871d64804f82291649e2a0accb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:50 -0400
Subject: NFSv4: Clean up nfs4_atomic_open

Start moving the 'struct nameidata' dependent code out of the lower level
NFS code in preparation for the removal of open intents.

Instead of the struct nameidata, we pass down a partially initialised
struct nfs_open_context that will be fully initialised by the atomic open
upon success.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 77 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/nfs/inode.c         |  8 +++---
 fs/nfs/nfs4_fs.h       |  2 +-
 fs/nfs/nfs4proc.c      | 40 ++++++++------------------
 include/linux/nfs_fs.h |  2 ++
 5 files changed, 93 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..17529b5bc551 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -38,6 +38,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 /* #define NFS_DEBUG_VERBOSE 1 */
 
@@ -1029,9 +1030,61 @@ static int is_atomic_open(struct nameidata *nd)
 	return 1;
 }
 
+static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+{
+	struct path path = {
+		.mnt = nd->path.mnt,
+		.dentry = dentry,
+	};
+	struct nfs_open_context *ctx;
+	struct rpc_cred *cred;
+	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return ERR_CAST(cred);
+	ctx = alloc_nfs_open_context(&path, cred, fmode);
+	put_rpccred(cred);
+	if (ctx == NULL)
+		return ERR_PTR(-ENOMEM);
+	return ctx;
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+	nfs_fscache_set_inode_cookie(inode, filp);
+	return 0;
+}
+
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+	struct file *filp;
+	int ret = 0;
+
+	/* If the open_intent is for execute, we have an extra check to make */
+	if (ctx->mode & FMODE_EXEC) {
+		ret = nfs_may_open(ctx->path.dentry->d_inode,
+				ctx->cred,
+				nd->intent.open.flags);
+		if (ret < 0)
+			goto out;
+	}
+	filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+	if (IS_ERR(filp))
+		ret = PTR_ERR(filp);
+	else
+		nfs_file_set_open_context(filp, ctx);
+out:
+	put_nfs_open_context(ctx);
+	return ret;
+}
+
 static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
+	struct nfs_open_context *ctx;
+	struct iattr attr;
 	struct dentry *res = NULL;
+	int open_flags;
 	int error;
 
 	dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
@@ -1054,9 +1107,27 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		goto out;
 	}
 
+	ctx = nameidata_to_nfs_open_context(dentry, nd);
+	res = ERR_CAST(ctx);
+	if (IS_ERR(ctx))
+		goto out;
+
+	open_flags = nd->intent.open.flags;
+	if (nd->flags & LOOKUP_CREATE) {
+		attr.ia_mode = nd->intent.open.create_mode;
+		attr.ia_valid = ATTR_MODE;
+		if (!IS_POSIXACL(dir))
+			attr.ia_mode &= ~current_umask();
+	} else {
+		open_flags &= ~O_EXCL;
+		attr.ia_valid = 0;
+		BUG_ON(open_flags & O_CREAT);
+	}
+
 	/* Open the file on the server */
-	res = nfs4_atomic_open(dir, dentry, nd);
+	res = nfs4_atomic_open(dir, ctx, open_flags, &attr);
 	if (IS_ERR(res)) {
+		put_nfs_open_context(ctx);
 		error = PTR_ERR(res);
 		switch (error) {
 			/* Make a negative dentry */
@@ -1074,8 +1145,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 			default:
 				goto out;
 		}
-	} else if (res != NULL)
+	}
+	if (res != NULL)
 		dentry = res;
+	nfs_intent_set_file(nd, ctx);
 out:
 	return res;
 no_open:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..2f9266406afc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -623,7 +623,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	nfs_revalidate_inode(server, inode);
 }
 
-static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
+struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
 {
 	struct nfs_open_context *ctx;
 
@@ -633,6 +633,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 		path_get(&ctx->path);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
+		ctx->mode = f_mode;
 		ctx->flags = 0;
 		ctx->error = 0;
 		ctx->dir_cookie = 0;
@@ -673,7 +674,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  */
-static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +731,10 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(&filp->f_path, cred);
+	ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
-	ctx->mode = filp->f_mode;
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
 	nfs_fscache_set_inode_cookie(inode, filp);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..c5cc2a6aceb0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,7 +242,7 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+extern struct dentry *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..38c3bed2240d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2025,39 +2025,17 @@ out_close:
 }
 
 struct dentry *
-nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
-	struct path path = {
-		.mnt = nd->path.mnt,
-		.dentry = dentry,
-	};
+	struct dentry *dentry = ctx->path.dentry;
 	struct dentry *parent;
-	struct iattr attr;
-	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	struct dentry *res;
-	int open_flags = nd->intent.open.flags;
-	fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
-
-	if (nd->flags & LOOKUP_CREATE) {
-		attr.ia_mode = nd->intent.open.create_mode;
-		attr.ia_valid = ATTR_MODE;
-		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current_umask();
-	} else {
-		open_flags &= ~O_EXCL;
-		attr.ia_valid = 0;
-		BUG_ON(open_flags & O_CREAT);
-	}
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return (struct dentry *)cred;
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
-	put_rpccred(cred);
+	state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
 	if (IS_ERR(state)) {
 		if (PTR_ERR(state) == -ENOENT) {
 			d_add(dentry, NULL);
@@ -2067,11 +2045,15 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		return (struct dentry *)state;
 	}
 	res = d_add_unique(dentry, igrab(state->inode));
-	if (res != NULL)
-		path.dentry = res;
-	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
+	if (res != NULL) {
+		struct dentry *dummy = ctx->path.dentry;
+
+		ctx->path.dentry = dget(res);
+		dput(dummy);
+	}
+	ctx->state = state;
+	nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir));
 	nfs_unblock_sillyrename(parent);
-	nfs4_intent_set_file(nd, &path, state, fmode);
 	return res;
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 2a18f1582fa4..61c89b4ad7c6 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -360,6 +360,8 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode);
+extern struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode);
+extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
-- 
cgit v1.2.3


From f46e0bd34ec002d0727761da52b8fd47f06d4440 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:50 -0400
Subject: NFSv4: Further minor cleanups for nfs4_atomic_open()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 21 +++++++++++++++------
 fs/nfs/nfs4_fs.h  |  2 +-
 fs/nfs/nfs4proc.c | 28 ++++------------------------
 3 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 17529b5bc551..194676cc5b04 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1084,8 +1084,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	struct nfs_open_context *ctx;
 	struct iattr attr;
 	struct dentry *res = NULL;
+	struct inode *inode;
 	int open_flags;
-	int error;
 
 	dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1125,13 +1125,15 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	}
 
 	/* Open the file on the server */
-	res = nfs4_atomic_open(dir, ctx, open_flags, &attr);
-	if (IS_ERR(res)) {
+	nfs_block_sillyrename(dentry->d_parent);
+	inode = nfs4_atomic_open(dir, ctx, open_flags, &attr);
+	if (IS_ERR(inode)) {
+		nfs_unblock_sillyrename(dentry->d_parent);
 		put_nfs_open_context(ctx);
-		error = PTR_ERR(res);
-		switch (error) {
+		switch (PTR_ERR(inode)) {
 			/* Make a negative dentry */
 			case -ENOENT:
+				d_add(dentry, NULL);
 				res = NULL;
 				goto out;
 			/* This turned out not to be a regular file */
@@ -1143,13 +1145,20 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 					goto no_open;
 			/* case -EINVAL: */
 			default:
+				res = ERR_CAST(inode);
 				goto out;
 		}
 	}
-	if (res != NULL)
+	res = d_add_unique(dentry, inode);
+	if (res != NULL) {
+		dput(ctx->path.dentry);
+		ctx->path.dentry = dget(res);
 		dentry = res;
+	}
 	nfs_intent_set_file(nd, ctx);
+	nfs_unblock_sillyrename(dentry->d_parent);
 out:
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return res;
 no_open:
 	return nfs_lookup(dir, dentry, nd);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c5cc2a6aceb0..8e1f6396d3de 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,7 +242,7 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct dentry *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
+extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 38c3bed2240d..f8d41eed9a6b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2024,37 +2024,17 @@ out_close:
 	return ret;
 }
 
-struct dentry *
+struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
-	struct dentry *dentry = ctx->path.dentry;
-	struct dentry *parent;
 	struct nfs4_state *state;
-	struct dentry *res;
 
-	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
-	nfs_block_sillyrename(parent);
 	state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
-	if (IS_ERR(state)) {
-		if (PTR_ERR(state) == -ENOENT) {
-			d_add(dentry, NULL);
-			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		}
-		nfs_unblock_sillyrename(parent);
-		return (struct dentry *)state;
-	}
-	res = d_add_unique(dentry, igrab(state->inode));
-	if (res != NULL) {
-		struct dentry *dummy = ctx->path.dentry;
-
-		ctx->path.dentry = dget(res);
-		dput(dummy);
-	}
+	if (IS_ERR(state))
+		return ERR_CAST(state);
 	ctx->state = state;
-	nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir));
-	nfs_unblock_sillyrename(parent);
-	return res;
+	return igrab(state->inode);
 }
 
 int
-- 
cgit v1.2.3


From b8d4caddd871758ffa156be51b4c8be82fea470d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:51 -0400
Subject: NFSv4: Clean up nfs4_open_revalidate

Remove references to 'struct nameidata' from the low-level open_revalidate
code, and replace them with a struct nfs_open_context which will be
correctly initialised upon success.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 11 ++++++++++-
 fs/nfs/nfs4_fs.h  |  2 +-
 fs/nfs/nfs4proc.c | 23 ++++++-----------------
 3 files changed, 17 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 194676cc5b04..196775c70948 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1169,6 +1169,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct dentry *parent = NULL;
 	struct inode *inode = dentry->d_inode;
 	struct inode *dir;
+	struct nfs_open_context *ctx;
 	int openflags, ret = 0;
 
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
@@ -1194,12 +1195,20 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	/* We can't create new files, or truncate existing ones here */
 	openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
 
+	ctx = nameidata_to_nfs_open_context(dentry, nd);
+	ret = PTR_ERR(ctx);
+	if (IS_ERR(ctx))
+		goto out;
 	/*
 	 * Note: we're not holding inode->i_mutex and so may be racing with
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
-	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+	ret = nfs4_open_revalidate(dir, ctx, openflags);
+	if (ret == 1)
+		nfs_intent_set_file(nd, ctx);
+	else
+		put_nfs_open_context(ctx);
 out:
 	dput(parent);
 	if (!ret)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8e1f6396d3de..76ec1c6d92ac 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -243,7 +243,7 @@ extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_open_revalidate(struct inode *, struct nfs_open_context *, int);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f8d41eed9a6b..b4762463b19f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2038,21 +2038,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 }
 
 int
-nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
+nfs4_open_revalidate(struct inode *dir, struct nfs_open_context *ctx, int openflags)
 {
-	struct path path = {
-		.mnt = nd->path.mnt,
-		.dentry = dentry,
-	};
-	struct rpc_cred *cred;
 	struct nfs4_state *state;
-	fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return PTR_ERR(cred);
-	state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
-	put_rpccred(cred);
+	state = nfs4_do_open(dir, &ctx->path, ctx->mode, openflags, NULL, ctx->cred);
 	if (IS_ERR(state)) {
 		switch (PTR_ERR(state)) {
 			case -EPERM:
@@ -2065,14 +2055,13 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 				goto out_drop;
 		}
 	}
-	if (state->inode == dentry->d_inode) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs4_intent_set_file(nd, &path, state, fmode);
+	ctx->state = state;
+	if (state->inode == ctx->path.dentry->d_inode) {
+		nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir));
 		return 1;
 	}
-	nfs4_close_sync(&path, state, fmode);
 out_drop:
-	d_drop(dentry);
+	d_drop(ctx->path.dentry);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 535918f14176396646b5547b7d1353c932f24f5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:51 -0400
Subject: NFSv4: Further cleanups for nfs4_open_revalidate()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 32 ++++++++++++++++++++++++++------
 fs/nfs/nfs4_fs.h  |  1 -
 fs/nfs/nfs4proc.c | 28 ----------------------------
 3 files changed, 26 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 196775c70948..dc93d356341b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1204,16 +1204,36 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
-	ret = nfs4_open_revalidate(dir, ctx, openflags);
-	if (ret == 1)
+	inode = nfs4_atomic_open(dir, ctx, openflags, NULL);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		switch (ret) {
+		case -EPERM:
+		case -EACCES:
+		case -EDQUOT:
+		case -ENOSPC:
+		case -EROFS:
+			goto out_put_ctx;
+		default:
+			goto out_drop;
+		}
+	}
+	iput(inode);
+	if (inode == dentry->d_inode) {
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		nfs_intent_set_file(nd, ctx);
-	else
-		put_nfs_open_context(ctx);
+	} else
+		goto out_drop;
 out:
 	dput(parent);
-	if (!ret)
-		d_drop(dentry);
 	return ret;
+out_drop:
+	d_drop(dentry);
+	ret = 0;
+out_put_ctx:
+	put_nfs_open_context(ctx);
+	goto out;
+
 no_open_dput:
 	dput(parent);
 no_open:
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 76ec1c6d92ac..6cf12ba9f4e7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -243,7 +243,6 @@ extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
-extern int nfs4_open_revalidate(struct inode *, struct nfs_open_context *, int);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b4762463b19f..83c5ef6e7cef 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2037,34 +2037,6 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	return igrab(state->inode);
 }
 
-int
-nfs4_open_revalidate(struct inode *dir, struct nfs_open_context *ctx, int openflags)
-{
-	struct nfs4_state *state;
-
-	state = nfs4_do_open(dir, &ctx->path, ctx->mode, openflags, NULL, ctx->cred);
-	if (IS_ERR(state)) {
-		switch (PTR_ERR(state)) {
-			case -EPERM:
-			case -EACCES:
-			case -EDQUOT:
-			case -ENOSPC:
-			case -EROFS:
-				return PTR_ERR(state);
-			default:
-				goto out_drop;
-		}
-	}
-	ctx->state = state;
-	if (state->inode == ctx->path.dentry->d_inode) {
-		nfs_set_verifier(ctx->path.dentry, nfs_save_change_attribute(dir));
-		return 1;
-	}
-out_drop:
-	d_drop(ctx->path.dentry);
-	return 0;
-}
-
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 	if (ctx->state == NULL)
-- 
cgit v1.2.3


From c0204fd2b8fe047b18b67e07e1bf2a03691240cd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:51 -0400
Subject: NFS: Clean up nfs4_proc_create()

Remove all remaining references to the struct nameidata from the low level
NFS layers. Again pass down a partially initialised struct nfs_open_context
when we want to do atomic open+create.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            | 47 +++++++++++++++++++++++++++++++++++------
 fs/nfs/nfs3proc.c       |  2 +-
 fs/nfs/nfs4proc.c       | 56 +++++++++++++------------------------------------
 fs/nfs/proc.c           |  2 +-
 include/linux/nfs_xdr.h |  2 +-
 5 files changed, 58 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index dc93d356341b..e37ffddd79c2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -105,8 +105,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
 #ifdef CONFIG_NFS_V4
 
 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
 const struct inode_operations nfs4_dir_inode_operations = {
-	.create		= nfs_create,
+	.create		= nfs_open_create,
 	.lookup		= nfs_atomic_lookup,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
@@ -1239,6 +1240,44 @@ no_open_dput:
 no_open:
 	return nfs_lookup_revalidate(dentry, nd);
 }
+
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	struct nfs_open_context *ctx = NULL;
+	struct iattr attr;
+	int error;
+	int open_flags = 0;
+
+	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	if ((nd->flags & LOOKUP_CREATE) != 0) {
+		open_flags = nd->intent.open.flags;
+
+		ctx = nameidata_to_nfs_open_context(dentry, nd);
+		error = PTR_ERR(ctx);
+		if (IS_ERR(ctx))
+			goto out_err;
+	}
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
+	if (error != 0)
+		goto out_put_ctx;
+	if (ctx != NULL)
+		nfs_intent_set_file(nd, ctx);
+	return 0;
+out_put_ctx:
+	if (ctx != NULL)
+		put_nfs_open_context(ctx);
+out_err:
+	d_drop(dentry);
+	return error;
+}
+
 #endif /* CONFIG_NFSV4 */
 
 static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
@@ -1369,7 +1408,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
 	struct iattr attr;
 	int error;
-	int open_flags = 0;
 
 	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1377,10 +1415,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-	if ((nd->flags & LOOKUP_CREATE) != 0)
-		open_flags = nd->intent.open.flags;
-
-	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
 	if (error != 0)
 		goto out_err;
 	return 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..2be4a7f59f1c 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
  */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		 int flags, struct nameidata *nd)
+		 int flags, struct nfs_open_context *ctx)
 {
 	struct nfs3_createdata *data;
 	mode_t mode = sattr->ia_mode;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83c5ef6e7cef..617b149ee16d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1998,32 +1998,6 @@ out:
 	return status;
 }
 
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
-{
-	struct file *filp;
-	int ret;
-
-	/* If the open_intent is for execute, we have an extra check to make */
-	if (fmode & FMODE_EXEC) {
-		ret = nfs_may_open(state->inode,
-				state->owner->so_cred,
-				nd->intent.open.flags);
-		if (ret < 0)
-			goto out_close;
-	}
-	filp = lookup_instantiate_filp(nd, path->dentry, NULL);
-	if (!IS_ERR(filp)) {
-		struct nfs_open_context *ctx;
-		ctx = nfs_file_open_context(filp);
-		ctx->state = state;
-		return 0;
-	}
-	ret = PTR_ERR(filp);
-out_close:
-	nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
-	return ret;
-}
-
 struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
@@ -2491,36 +2465,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
 
 static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
-	struct path path = {
-		.mnt = nd->path.mnt,
+	struct path my_path = {
 		.dentry = dentry,
 	};
+	struct path *path = &my_path;
 	struct nfs4_state *state;
-	struct rpc_cred *cred;
-	fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
+	struct rpc_cred *cred = NULL;
+	fmode_t fmode = 0;
 	int status = 0;
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred)) {
-		status = PTR_ERR(cred);
-		goto out;
+	if (ctx != NULL) {
+		cred = ctx->cred;
+		path = &ctx->path;
+		fmode = ctx->mode;
 	}
-	state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
+	state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
-		goto out_putcred;
+		goto out;
 	}
 	d_add(dentry, igrab(state->inode));
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
-		status = nfs4_intent_set_file(nd, &path, state, fmode);
+	if (ctx != NULL)
+		ctx->state = state;
 	else
-		nfs4_close_sync(&path, state, fmode);
-out_putcred:
-	put_rpccred(cred);
+		nfs4_close_sync(path, state, fmode);
 out:
 	return status;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..4ef39ae88ea5 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
 
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		int flags, struct nameidata *nd)
+		int flags, struct nfs_open_context *ctx)
 {
 	struct nfs_createdata *data;
 	struct rpc_message msg = {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index fc461926c412..b1484dad7bef 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1032,7 +1032,7 @@ struct nfs_rpc_ops {
 	int	(*readlink)(struct inode *, struct page *, unsigned int,
 			    unsigned int);
 	int	(*create)  (struct inode *, struct dentry *,
-			    struct iattr *, int, struct nameidata *);
+			    struct iattr *, int, struct nfs_open_context *);
 	int	(*remove)  (struct inode *, struct qstr *);
 	void	(*unlink_setup)  (struct rpc_message *, struct inode *dir);
 	int	(*unlink_done) (struct rpc_task *, struct inode *);
-- 
cgit v1.2.3


From 2b484297e48c3fbb1846fc6ea10036d9465273e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 17 Sep 2010 10:56:51 -0400
Subject: NFS: Add an 'open_context' element to struct nfs_rpc_ops

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            | 7 ++++---
 fs/nfs/nfs4_fs.h        | 1 -
 fs/nfs/nfs4proc.c       | 3 ++-
 include/linux/nfs_xdr.h | 4 ++++
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e37ffddd79c2..1e9e18813ad7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -34,7 +34,6 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 
-#include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
@@ -1127,7 +1126,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 
 	/* Open the file on the server */
 	nfs_block_sillyrename(dentry->d_parent);
-	inode = nfs4_atomic_open(dir, ctx, open_flags, &attr);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
 	if (IS_ERR(inode)) {
 		nfs_unblock_sillyrename(dentry->d_parent);
 		put_nfs_open_context(ctx);
@@ -1175,8 +1174,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
+
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
+
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
@@ -1205,7 +1206,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
-	inode = nfs4_atomic_open(dir, ctx, openflags, NULL);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
 		switch (ret) {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6cf12ba9f4e7..d24a8e07b5e2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,7 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct inode *nfs4_atomic_open(struct inode *, struct nfs_open_context *, int, struct iattr *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 617b149ee16d..643abd26f2de 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1998,7 +1998,7 @@ out:
 	return status;
 }
 
-struct inode *
+static struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
 	struct nfs4_state *state;
@@ -5358,6 +5358,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context  = nfs4_close_context,
+	.open_context	= nfs4_atomic_open,
 };
 
 /*
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b1484dad7bef..6f345f8af4ae 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1065,6 +1065,10 @@ struct nfs_rpc_ops {
 	int	(*lock_check_bounds)(const struct file_lock *);
 	void	(*clear_acl_cache)(struct inode *);
 	void	(*close_context)(struct nfs_open_context *ctx, int);
+	struct inode * (*open_context) (struct inode *dir,
+				struct nfs_open_context *ctx,
+				int open_flags,
+				struct iattr *iattr);
 };
 
 /*
-- 
cgit v1.2.3


From 920769f031a8aff87b66bdf49d1a0d0988241ef9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 17 Sep 2010 17:30:25 -0400
Subject: nfs: standardize the rename args container

Each NFS version has its own version of the rename args container.
Standardize them on a common one that's identical to the one NFSv4
uses.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c        |  8 ++++----
 fs/nfs/nfs3proc.c       | 12 +++++-------
 fs/nfs/nfs3xdr.c        | 10 +++++-----
 fs/nfs/nfs4proc.c       |  2 +-
 fs/nfs/nfs4xdr.c        |  2 +-
 fs/nfs/proc.c           | 10 ++++------
 include/linux/nfs_xdr.h | 39 ++++++++++++---------------------------
 7 files changed, 32 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..e15bc0306d0c 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
 static int
 nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
+	p = xdr_encode_fhandle(p, args->old_dir);
+	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+	p = xdr_encode_fhandle(p, args->new_dir);
+	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
 	return 0;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2be4a7f59f1c..6dc4ef66f074 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -442,13 +442,11 @@ static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		 struct inode *new_dir, struct qstr *new_name)
 {
-	struct nfs3_renameargs	arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+	struct nfs_renameargs	arg = {
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
 	};
 	struct nfs3_renameres res;
 	struct rpc_message msg = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..f385759eb35b 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -442,12 +442,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
  * Encode RENAME arguments
  */
 static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
+	p = xdr_encode_fhandle(p, args->old_dir);
+	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+	p = xdr_encode_fhandle(p, args->new_dir);
+	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
 	return 0;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 643abd26f2de..3aa13c1b8dd8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2570,7 +2570,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
 	struct nfs_server *server = NFS_SERVER(old_dir);
-	struct nfs4_rename_arg arg = {
+	struct nfs_renameargs arg = {
 		.old_dir = NFS_FH(old_dir),
 		.new_dir = NFS_FH(new_dir),
 		.old_name = old_name,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..7a098ae07a1b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1823,7 +1823,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4ef39ae88ea5..0aff10c3bbce 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -370,12 +370,10 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
 	struct nfs_renameargs	arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6f345f8af4ae..acb95fb27bcc 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -399,6 +399,18 @@ struct nfs_removeres {
 	struct nfs4_sequence_res 	seq_res;
 };
 
+/*
+ * Common arguments to the rename call
+ */
+struct nfs_renameargs {
+	const struct nfs_fh		*old_dir;
+	const struct nfs_fh		*new_dir;
+	const struct qstr		*old_name;
+	const struct qstr		*new_name;
+	const u32			*bitmask;
+	struct nfs4_sequence_args	seq_args;
+};
+
 /*
  * Argument struct for decode_entry function
  */
@@ -434,15 +446,6 @@ struct nfs_createargs {
 	struct iattr *		sattr;
 };
 
-struct nfs_renameargs {
-	struct nfs_fh *		fromfh;
-	const char *		fromname;
-	unsigned int		fromlen;
-	struct nfs_fh *		tofh;
-	const char *		toname;
-	unsigned int		tolen;
-};
-
 struct nfs_setattrargs {
 	struct nfs_fh *                 fh;
 	nfs4_stateid                    stateid;
@@ -586,15 +589,6 @@ struct nfs3_mknodargs {
 	dev_t			rdev;
 };
 
-struct nfs3_renameargs {
-	struct nfs_fh *		fromfh;
-	const char *		fromname;
-	unsigned int		fromlen;
-	struct nfs_fh *		tofh;
-	const char *		toname;
-	unsigned int		tolen;
-};
-
 struct nfs3_linkargs {
 	struct nfs_fh *		fromfh;
 	struct nfs_fh *		tofh;
@@ -801,15 +795,6 @@ struct nfs4_readlink_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
-struct nfs4_rename_arg {
-	const struct nfs_fh *		old_dir;
-	const struct nfs_fh *		new_dir;
-	const struct qstr *		old_name;
-	const struct qstr *		new_name;
-	const u32 *			bitmask;
-	struct nfs4_sequence_args	seq_args;
-};
-
 struct nfs4_rename_res {
 	const struct nfs_server *	server;
 	struct nfs4_change_info		old_cinfo;
-- 
cgit v1.2.3


From e8582a8b96f329083b4da29aa87bc43cc0d80dd1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 17 Sep 2010 17:31:06 -0400
Subject: nfs: standardize the rename response container

Right now, v3 and v4 have their own variants. Create a standard struct
that will work for v3 and v4. v2 doesn't get anything but a simple error
and so isn't affected by this.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c       | 16 ++++++++--------
 fs/nfs/nfs3xdr.c        |  6 +++---
 fs/nfs/nfs4proc.c       |  2 +-
 fs/nfs/nfs4xdr.c        |  2 +-
 include/linux/nfs_xdr.h | 23 +++++++++--------------
 5 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 6dc4ef66f074..bb41d88e1567 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -448,7 +448,7 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_dir	= NFS_FH(new_dir),
 		.new_name	= new_name,
 	};
-	struct nfs3_renameres res;
+	struct nfs_renameres res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME],
 		.rpc_argp	= &arg,
@@ -458,17 +458,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 
 	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
 
-	res.fromattr = nfs_alloc_fattr();
-	res.toattr = nfs_alloc_fattr();
-	if (res.fromattr == NULL || res.toattr == NULL)
+	res.old_fattr = nfs_alloc_fattr();
+	res.new_fattr = nfs_alloc_fattr();
+	if (res.old_fattr == NULL || res.new_fattr == NULL)
 		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-	nfs_post_op_update_inode(old_dir, res.fromattr);
-	nfs_post_op_update_inode(new_dir, res.toattr);
+	nfs_post_op_update_inode(old_dir, res.old_fattr);
+	nfs_post_op_update_inode(new_dir, res.new_fattr);
 out:
-	nfs_free_fattr(res.toattr);
-	nfs_free_fattr(res.fromattr);
+	nfs_free_fattr(res.old_fattr);
+	nfs_free_fattr(res.new_fattr);
 	dprintk("NFS reply rename: %d\n", status);
 	return status;
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f385759eb35b..89c40f8ec6e6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -970,14 +970,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
  * Decode RENAME reply
  */
 static int
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
 {
 	int	status;
 
 	if ((status = ntohl(*p++)) != 0)
 		status = nfs_stat_to_errno(status);
-	p = xdr_decode_wcc_data(p, res->fromattr);
-	p = xdr_decode_wcc_data(p, res->toattr);
+	p = xdr_decode_wcc_data(p, res->old_fattr);
+	p = xdr_decode_wcc_data(p, res->new_fattr);
 	return status;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3aa13c1b8dd8..a3c21cc4677b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2577,7 +2577,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_name = new_name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs4_rename_res res = {
+	struct nfs_renameres res = {
 		.server = server,
 	};
 	struct rpc_message msg = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7a098ae07a1b..b0bd7efe8100 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4873,7 +4873,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index acb95fb27bcc..9ad132e13d12 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -411,6 +411,15 @@ struct nfs_renameargs {
 	struct nfs4_sequence_args	seq_args;
 };
 
+struct nfs_renameres {
+	const struct nfs_server		*server;
+	struct nfs4_change_info		old_cinfo;
+	struct nfs_fattr		*old_fattr;
+	struct nfs4_change_info		new_cinfo;
+	struct nfs_fattr		*new_fattr;
+	struct nfs4_sequence_res	seq_res;
+};
+
 /*
  * Argument struct for decode_entry function
  */
@@ -623,11 +632,6 @@ struct nfs3_readlinkargs {
 	struct page **		pages;
 };
 
-struct nfs3_renameres {
-	struct nfs_fattr *	fromattr;
-	struct nfs_fattr *	toattr;
-};
-
 struct nfs3_linkres {
 	struct nfs_fattr *	dir_attr;
 	struct nfs_fattr *	fattr;
@@ -795,15 +799,6 @@ struct nfs4_readlink_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
-struct nfs4_rename_res {
-	const struct nfs_server *	server;
-	struct nfs4_change_info		old_cinfo;
-	struct nfs_fattr *		old_fattr;
-	struct nfs4_change_info		new_cinfo;
-	struct nfs_fattr *		new_fattr;
-	struct nfs4_sequence_res	seq_res;
-};
-
 #define NFS4_SETCLIENTID_NAMELEN	(127)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
-- 
cgit v1.2.3


From 779c51795bfb35c2403c924b9de90ca9356bc693 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 17 Sep 2010 17:31:30 -0400
Subject: nfs: move nfs_sillyrename to unlink.c

...since that's where most of the sillyrenaming code lives. A comment
block is added to the beginning as well to clarify how sillyrenaming
works. Also, make nfs_async_unlink static as nfs_sillyrename is the only
caller.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 70 -----------------------------------------
 fs/nfs/unlink.c        | 85 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs.h |  2 +-
 3 files changed, 85 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1e9e18813ad7..86f1d41c6078 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1498,76 +1498,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return error;
 }
 
-static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
-{
-	static unsigned int sillycounter;
-	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
-	const int      countersize = sizeof(sillycounter)*2;
-	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
-	char           silly[slen+1];
-	struct qstr    qsilly;
-	struct dentry *sdentry;
-	int            error = -EIO;
-
-	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name, 
-		atomic_read(&dentry->d_count));
-	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
-
-	/*
-	 * We don't allow a dentry to be silly-renamed twice.
-	 */
-	error = -EBUSY;
-	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
-		goto out;
-
-	sprintf(silly, ".nfs%*.*Lx",
-		fileidsize, fileidsize,
-		(unsigned long long)NFS_FILEID(dentry->d_inode));
-
-	/* Return delegation in anticipation of the rename */
-	nfs_inode_return_delegation(dentry->d_inode);
-
-	sdentry = NULL;
-	do {
-		char *suffix = silly + slen - countersize;
-
-		dput(sdentry);
-		sillycounter++;
-		sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
-
-		dfprintk(VFS, "NFS: trying to rename %s to %s\n",
-				dentry->d_name.name, silly);
-		
-		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
-		/*
-		 * N.B. Better to return EBUSY here ... it could be
-		 * dangerous to delete the file while it's in use.
-		 */
-		if (IS_ERR(sdentry))
-			goto out;
-	} while(sdentry->d_inode != NULL); /* need negative lookup */
-
-	qsilly.name = silly;
-	qsilly.len  = strlen(silly);
-	if (dentry->d_inode) {
-		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-				dir, &qsilly);
-		nfs_mark_for_revalidate(dentry->d_inode);
-	} else
-		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-				dir, &qsilly);
-	if (!error) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		d_move(dentry, sdentry);
-		error = nfs_async_unlink(dir, dentry);
- 		/* If we return 0 we don't unlink */
-	}
-	dput(sdentry);
-out:
-	return error;
-}
-
 /*
  * Remove a file after making sure there are no pending writes,
  * and after checking that the file has only one user. 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..c3ce865294f1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
 
 struct nfs_unlinkdata {
 	struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct nfs_unlinkdata *data;
@@ -303,3 +306,83 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
 		nfs_free_unlinkdata(data);
 }
+
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+	static unsigned int sillycounter;
+	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
+	const int      countersize = sizeof(sillycounter)*2;
+	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
+	char           silly[slen+1];
+	struct qstr    qsilly;
+	struct dentry *sdentry;
+	int            error = -EIO;
+
+	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		atomic_read(&dentry->d_count));
+	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+
+	/*
+	 * We don't allow a dentry to be silly-renamed twice.
+	 */
+	error = -EBUSY;
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto out;
+
+	sprintf(silly, ".nfs%*.*Lx",
+		fileidsize, fileidsize,
+		(unsigned long long)NFS_FILEID(dentry->d_inode));
+
+	/* Return delegation in anticipation of the rename */
+	nfs_inode_return_delegation(dentry->d_inode);
+
+	sdentry = NULL;
+	do {
+		char *suffix = silly + slen - countersize;
+
+		dput(sdentry);
+		sillycounter++;
+		sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
+
+		dfprintk(VFS, "NFS: trying to rename %s to %s\n",
+				dentry->d_name.name, silly);
+
+		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+		/*
+		 * N.B. Better to return EBUSY here ... it could be
+		 * dangerous to delete the file while it's in use.
+		 */
+		if (IS_ERR(sdentry))
+			goto out;
+	} while (sdentry->d_inode != NULL); /* need negative lookup */
+
+	qsilly.name = silly;
+	qsilly.len  = strlen(silly);
+	error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly);
+	if (dentry->d_inode)
+		nfs_mark_for_revalidate(dentry->d_inode);
+	if (!error) {
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+		d_move(dentry, sdentry);
+		error = nfs_async_unlink(dir, dentry);
+		/* If we return 0 we don't unlink */
+	}
+	dput(sdentry);
+out:
+	return error;
+}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 61c89b4ad7c6..d929b1883644 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -485,10 +485,10 @@ extern void nfs_release_automount_timer(void);
 /*
  * linux/fs/nfs/unlink.c
  */
-extern int  nfs_async_unlink(struct inode *dir, struct dentry *dentry);
 extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
 extern void nfs_block_sillyrename(struct dentry *dentry);
 extern void nfs_unblock_sillyrename(struct dentry *dentry);
+extern int  nfs_sillyrename(struct inode *dir, struct dentry *dentry);
 
 /*
  * linux/fs/nfs/write.c
-- 
cgit v1.2.3


From d3d4152a5d59af9e13a73efa9e9c24383fbe307f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 17 Sep 2010 17:31:57 -0400
Subject: nfs: make sillyrename an async operation

A synchronous rename can be interrupted by a SIGKILL. If that happens
during a sillyrename operation, it's possible for the rename call to
be sent to the server, but the task exits before processing the
reply. If this happens, the sillyrenamed file won't get cleaned up
during nfs_dentry_iput and the server is left with a dangling .nfs* file
hanging around.

Fix this problem by turning sillyrename into an asynchronous operation
and have the task doing the sillyrename just wait on the reply. If the
task is killed before the sillyrename completes, it'll still proceed
to completion.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c       |  23 ++++++
 fs/nfs/nfs4proc.c       |  30 ++++++++
 fs/nfs/proc.c           |  19 +++++
 fs/nfs/unlink.c         | 200 +++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/nfs_xdr.h |   2 +
 5 files changed, 263 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index bb41d88e1567..4e9d941ab548 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -438,6 +438,27 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
+}
+
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		      struct inode *new_dir)
+{
+	struct nfs_renameres *res;
+
+	if (nfs3_async_handle_jukebox(task, old_dir))
+		return 0;
+	res = task->tk_msg.rpc_resp;
+
+	nfs_post_op_update_inode(old_dir, res->old_fattr);
+	nfs_post_op_update_inode(new_dir, res->new_fattr);
+	return 1;
+}
+
 static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		 struct inode *new_dir, struct qstr *new_name)
@@ -842,6 +863,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.unlink_setup	= nfs3_proc_unlink_setup,
 	.unlink_done	= nfs3_proc_unlink_done,
 	.rename		= nfs3_proc_rename,
+	.rename_setup	= nfs3_proc_rename_setup,
+	.rename_done	= nfs3_proc_rename_done,
 	.link		= nfs3_proc_link,
 	.symlink	= nfs3_proc_symlink,
 	.mkdir		= nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a3c21cc4677b..c46e45e9b33f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2566,6 +2566,34 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_renameargs *arg = msg->rpc_argp;
+	struct nfs_renameres *res = msg->rpc_resp;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+	arg->bitmask = server->attr_bitmask;
+	res->server = server;
+}
+
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+				 struct inode *new_dir)
+{
+	struct nfs_renameres *res = task->tk_msg.rpc_resp;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+		return 0;
+
+	update_changeattr(old_dir, &res->old_cinfo);
+	nfs_post_op_update_inode(old_dir, res->old_fattr);
+	update_changeattr(new_dir, &res->new_cinfo);
+	nfs_post_op_update_inode(new_dir, res->new_fattr);
+	return 1;
+}
+
 static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
@@ -5338,6 +5366,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.unlink_setup	= nfs4_proc_unlink_setup,
 	.unlink_done	= nfs4_proc_unlink_done,
 	.rename		= nfs4_proc_rename,
+	.rename_setup	= nfs4_proc_rename_setup,
+	.rename_done	= nfs4_proc_rename_done,
 	.link		= nfs4_proc_link,
 	.symlink	= nfs4_proc_symlink,
 	.mkdir		= nfs4_proc_mkdir,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0aff10c3bbce..e5e84aa2af17 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -365,6 +365,23 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	if (nfs_async_handle_expired_key(task))
+		return 0;
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
@@ -703,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.unlink_setup	= nfs_proc_unlink_setup,
 	.unlink_done	= nfs_proc_unlink_done,
 	.rename		= nfs_proc_rename,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_done	= nfs_proc_rename_done,
 	.link		= nfs_proc_link,
 	.symlink	= nfs_proc_symlink,
 	.mkdir		= nfs_proc_mkdir,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index c3ce865294f1..698b3e6367ff 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -307,6 +307,174 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 		nfs_free_unlinkdata(data);
 }
 
+/* Cancel a queued async unlink. Called when a sillyrename run fails. */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		struct nfs_unlinkdata *data = dentry->d_fsdata;
+
+		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		spin_unlock(&dentry->d_lock);
+		nfs_free_unlinkdata(data);
+		return;
+	}
+	spin_unlock(&dentry->d_lock);
+}
+
+struct nfs_renamedata {
+	struct nfs_renameargs	args;
+	struct nfs_renameres	res;
+	struct rpc_cred		*cred;
+	struct inode		*old_dir;
+	struct dentry		*old_dentry;
+	struct nfs_fattr	old_fattr;
+	struct inode		*new_dir;
+	struct dentry		*new_dentry;
+	struct nfs_fattr	new_fattr;
+};
+
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Do the directory attribute updates and the d_move
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	struct inode *old_dir = data->old_dir;
+	struct inode *new_dir = data->new_dir;
+
+	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
+		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
+		return;
+	}
+
+	if (task->tk_status != 0) {
+		nfs_cancel_async_unlink(data->old_dentry);
+		return;
+	}
+
+	nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
+	d_move(data->old_dentry, data->new_dentry);
+}
+
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+	struct nfs_renamedata	*data = calldata;
+	struct super_block *sb = data->old_dir->i_sb;
+
+	if (data->old_dentry->d_inode)
+		nfs_mark_for_revalidate(data->old_dentry->d_inode);
+
+	dput(data->old_dentry);
+	dput(data->new_dentry);
+	iput(data->old_dir);
+	iput(data->new_dir);
+	nfs_sb_deactive(sb);
+	put_rpccred(data->cred);
+	kfree(data);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->old_dir);
+
+	if (nfs4_setup_sequence(server, &data->args.seq_args,
+				&data->res.seq_res, 1, task))
+		return;
+	rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct rpc_call_ops nfs_rename_ops = {
+	.rpc_call_done = nfs_async_rename_done,
+	.rpc_release = nfs_async_rename_release,
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_rename_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+};
+
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ */
+static struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	struct nfs_renamedata *data;
+	struct rpc_message msg = { };
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs_rename_ops,
+		.workqueue = nfsiod_workqueue,
+		.rpc_client = NFS_CLIENT(old_dir),
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return ERR_PTR(-ENOMEM);
+	task_setup_data.callback_data = data,
+
+	data->cred = rpc_lookup_cred();
+	if (IS_ERR(data->cred)) {
+		task = (struct rpc_task *)data->cred;
+		kfree(data);
+		return task;
+	}
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	msg.rpc_cred = data->cred;
+
+	/* set up nfs_renamedata */
+	data->old_dir = old_dir;
+	atomic_inc(&old_dir->i_count);
+	data->new_dir = new_dir;
+	atomic_inc(&new_dir->i_count);
+	data->old_dentry = dget(old_dentry);
+	data->new_dentry = dget(new_dentry);
+	nfs_fattr_init(&data->old_fattr);
+	nfs_fattr_init(&data->new_fattr);
+
+	/* set up nfs_renameargs */
+	data->args.old_dir = NFS_FH(old_dir);
+	data->args.old_name = &old_dentry->d_name;
+	data->args.new_dir = NFS_FH(new_dir);
+	data->args.new_name = &new_dentry->d_name;
+
+	/* set up nfs_renameres */
+	data->res.old_fattr = &data->old_fattr;
+	data->res.new_fattr = &data->new_fattr;
+
+	nfs_sb_active(old_dir->i_sb);
+
+	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		nfs_async_rename_release(data);
+
+	return task;
+}
+
 /**
  * nfs_sillyrename - Perform a silly-rename of a dentry
  * @dir: inode of directory that contains dentry
@@ -328,8 +496,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	const int      countersize = sizeof(sillycounter)*2;
 	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
 	char           silly[slen+1];
-	struct qstr    qsilly;
 	struct dentry *sdentry;
+	struct rpc_task *task;
 	int            error = -EIO;
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
@@ -371,17 +539,27 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 			goto out;
 	} while (sdentry->d_inode != NULL); /* need negative lookup */
 
-	qsilly.name = silly;
-	qsilly.len  = strlen(silly);
-	error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly);
-	if (dentry->d_inode)
-		nfs_mark_for_revalidate(dentry->d_inode);
-	if (!error) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		d_move(dentry, sdentry);
-		error = nfs_async_unlink(dir, dentry);
-		/* If we return 0 we don't unlink */
+	/* queue unlink first. Can't do this from rpc_release as it
+	 * has to allocate memory
+	 */
+	error = nfs_async_unlink(dir, dentry);
+	if (error)
+		goto out_dput;
+
+	/* run the rename task, undo unlink if it fails */
+	task = nfs_async_rename(dir, dir, dentry, sdentry);
+	if (IS_ERR(task)) {
+		error = -EBUSY;
+		nfs_cancel_async_unlink(dentry);
+		goto out_dput;
 	}
+
+	/* wait for the RPC task to complete, unless a SIGKILL intervenes */
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	rpc_put_task(task);
+out_dput:
 	dput(sdentry);
 out:
 	return error;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9ad132e13d12..172df83ac54b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1018,6 +1018,8 @@ struct nfs_rpc_ops {
 	int	(*unlink_done) (struct rpc_task *, struct inode *);
 	int	(*rename)  (struct inode *, struct qstr *,
 			    struct inode *, struct qstr *);
+	void	(*rename_setup)  (struct rpc_message *msg, struct inode *dir);
+	int	(*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir);
 	int	(*link)    (struct inode *, struct inode *, struct qstr *);
 	int	(*symlink) (struct inode *, struct dentry *, struct page *,
 			    unsigned int, struct iattr *);
-- 
cgit v1.2.3


From 2ef13294d29bcfb306e0d360f1b97f37b647b0c0 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 19 Sep 2010 18:34:26 +0300
Subject: UBIFS: introduce new flags for RO mounts

Commit 2fde99cb55fb9d9b88180512a5e8a5d939d27fec "UBIFS: mark VFS SB RO too"
introduced regression. This commit made UBIFS set the 'MS_RDONLY' flag in the
VFS superblock when it switches to R/O mode due to an error. This was done
to make VFS show the R/O UBIFS flag in /proc/mounts.

However, several places in UBIFS relied on the 'MS_RDONLY' flag and assume this
flag can only change when we re-mount. For example, 'ubifs_put_super()'.

This patch introduces new UBIFS flag - 'c->ro_mount' which changes only when
we re-mount, and preserves the way UBIFS was originally mounted (R/W or R/O).
This allows us to de-initialize UBIFS cleanly in 'ubifs_put_super()'.

This patch also changes all 'ubifs_assert(!c->ro_media)' assertions to
'ubifs_assert(!c->ro_media && !c->ro_mount)', because we never should write
anything if the FS was mounter R/O.

All the places where we test for 'MS_RDONLY' flag in the VFS SB were changed
and now we test the 'c->ro_mount' flag instead, because it preserves the
original UBIFS mount type, unlike the 'MS_RDONLY' flag.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/commit.c   |  2 +-
 fs/ubifs/file.c     |  5 ++---
 fs/ubifs/gc.c       |  2 +-
 fs/ubifs/io.c       |  9 ++++-----
 fs/ubifs/journal.c  |  2 +-
 fs/ubifs/log.c      |  4 ++--
 fs/ubifs/master.c   |  2 +-
 fs/ubifs/misc.h     |  6 +++---
 fs/ubifs/recovery.c |  8 ++++----
 fs/ubifs/replay.c   |  3 +--
 fs/ubifs/sb.c       |  9 +++------
 fs/ubifs/shrinker.c |  2 +-
 fs/ubifs/super.c    | 55 ++++++++++++++++++++++++++++-------------------------
 fs/ubifs/ubifs.h    | 13 +++++++++----
 14 files changed, 62 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 712432789fb8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,7 @@ static int do_commit(struct ubifs_info *c)
 	struct ubifs_lp_stats lst;
 
 	dbg_cmt("start");
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->ro_error) {
 		err = -EROFS;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index c6bc51c9f07c..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,7 +433,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (unlikely(c->ro_error))
 		return -EROFS;
@@ -1440,8 +1440,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
 
 	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, page->index,
 		i_size_read(inode));
-	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (unlikely(c->ro_error))
 		return VM_FAULT_SIGBUS; /* -EROFS */
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d927196d730b..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -616,7 +616,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	ubifs_assert_cmt_locked(c);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (ubifs_gc_should_commit(c))
 		return -EAGAIN;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 18a4b8d7c844..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -356,10 +356,9 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 
 	dbg_io("LEB %d:%d, %d bytes, jhead %s",
 	       wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->ro_error)
 		return -EROFS;
@@ -441,7 +440,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
 {
 	int err, i;
 
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (!c->need_wbuf_sync)
 		return 0;
 	c->need_wbuf_sync = 0;
@@ -521,7 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
 	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
 		err = -ENOSPC;
@@ -666,7 +665,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
 	       buf_len);
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
 	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->ro_error)
 		return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a6da8aa68f37..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,7 +122,7 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
 	 * better to try to allocate space at the ends of eraseblocks. This is
 	 * what the squeeze parameter does.
 	 */
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	squeeze = (jhead == BASEHD);
 again:
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a41713e2fbb3..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
 		jhead = &c->jheads[bud->jhead];
 		list_add_tail(&bud->list, &jhead->buds_list);
 	} else
-		ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+		ubifs_assert(c->replaying && c->ro_mount);
 
 	/*
 	 * Note, although this is a new bud, we anyway account this space now,
@@ -223,7 +223,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	mutex_lock(&c->log_mutex);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->ro_error) {
 		err = -EROFS;
 		goto out_unlock;
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 0c818e855f59..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,7 @@ int ubifs_write_master(struct ubifs_info *c)
 {
 	int err, lnum, offs, len;
 
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->ro_error)
 		return -EROFS;
 
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 5d476ba184e6..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,7 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
 {
 	int err;
 
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_unmap(c->ubi, lnum);
@@ -160,7 +160,7 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
@@ -188,7 +188,7 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..c902a5de90ae 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
 
 	memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
 
-	if ((c->vfs_sb->s_flags & MS_RDONLY)) {
+	if (c->ro_mount) {
 		/* Read-only mode. Keep a copy for switching to rw mode */
 		c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
 		if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		endpt = snod->offs + snod->len;
 	}
 
-	if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
+	if (c->ro_mount && !c->remounting_rw) {
 		/* Add to recovery list */
 		struct ubifs_unclean_leb *ucleb;
 
@@ -883,7 +883,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
 {
 	int err;
 
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
+	ubifs_assert(!c->ro_mount || c->remounting_rw);
 
 	dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
 	err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1461,7 @@ int ubifs_recover_size(struct ubifs_info *c)
 			}
 		}
 		if (e->exists && e->i_size < e->d_size) {
-			if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
+			if (!e->inode && c->ro_mount) {
 				/* Fix the inode size and pin it in memory */
 				struct inode *inode;
 
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..730598cf6342 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
 	ubifs_assert(sleb->endpt - offs >= used);
 	ubifs_assert(sleb->endpt % c->min_io_size == 0);
 
-	if (sleb->endpt + c->min_io_size <= c->leb_size &&
-	    !(c->vfs_sb->s_flags & MS_RDONLY))
+	if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
 		err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
 					     sleb->endpt, UBI_SHORTTERM);
 
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	 * due to the unavailability of time-travelling equipment.
 	 */
 	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
-		struct super_block *sb = c->vfs_sb;
-		int mounting_ro = sb->s_flags & MS_RDONLY;
-
-		ubifs_assert(!c->ro_media || mounting_ro);
-		if (!mounting_ro ||
+		ubifs_assert(!c->ro_media || c->ro_mount);
+		if (!c->ro_mount ||
 		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
 			ubifs_err("on-flash format version is w%d/r%d, but "
 				  "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->old_leb_cnt = c->leb_cnt;
 	if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
 		c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
-		if (c->vfs_sb->s_flags & MS_RDONLY)
+		if (c->ro_mount)
 			dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
 				c->old_leb_cnt,	c->leb_cnt);
 		else {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 10eec8778438..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
 			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
 
 			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
-			    c->ro_media || c->ro_error) {
+			    c->ro_mount || c->ro_error) {
 				mutex_unlock(&c->umount_mutex);
 				continue;
 			}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1cfeec56df91..4e5bf3fbf782 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1137,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
  */
 static int mount_ubifs(struct ubifs_info *c)
 {
-	struct super_block *sb = c->vfs_sb;
-	int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
+	int err;
 	long long x;
 	size_t sz;
 
+	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
 	err = init_constants_early(c);
 	if (err)
 		return err;
@@ -1154,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_free;
 
-	if (c->empty && (mounted_read_only || c->ro_media)) {
+	if (c->empty && (c->ro_mount || c->ro_media)) {
 		/*
 		 * This UBI volume is empty, and read-only, or the file system
 		 * is mounted read-only - we cannot format it.
@@ -1165,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_free;
 	}
 
-	if (c->ro_media && !mounted_read_only) {
+	if (c->ro_media && !c->ro_mount) {
 		ubifs_err("cannot mount read-write - read-only media");
 		err = -EROFS;
 		goto out_free;
@@ -1185,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (!c->sbuf)
 		goto out_free;
 
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		c->ileb_buf = vmalloc(c->leb_size);
 		if (!c->ileb_buf)
 			goto out_free;
@@ -1228,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	}
 
 	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		err = alloc_wbufs(c);
 		if (err)
 			goto out_cbuf;
@@ -1254,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
 	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
 		ubifs_msg("recovery needed");
 		c->need_recovery = 1;
-		if (!mounted_read_only) {
+		if (!c->ro_mount) {
 			err = ubifs_recover_inl_heads(c, c->sbuf);
 			if (err)
 				goto out_master;
 		}
-	} else if (!mounted_read_only) {
+	} else if (!c->ro_mount) {
 		/*
 		 * Set the "dirty" flag so that if we reboot uncleanly we
 		 * will notice this immediately on the next mount.
@@ -1270,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_master;
 	}
 
-	err = ubifs_lpt_init(c, 1, !mounted_read_only);
+	err = ubifs_lpt_init(c, 1, !c->ro_mount);
 	if (err)
 		goto out_lpt;
 
@@ -1285,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
 	/* Calculate 'min_idx_lebs' after journal replay */
 	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 
-	err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
+	err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
 	if (err)
 		goto out_orphans;
 
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		int lnum;
 
 		err = check_free_space(c);
@@ -1351,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	spin_unlock(&ubifs_infos_lock);
 
 	if (c->need_recovery) {
-		if (mounted_read_only)
+		if (c->ro_mount)
 			ubifs_msg("recovery deferred");
 		else {
 			c->need_recovery = 0;
@@ -1378,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
-	if (mounted_read_only)
+	if (c->ro_mount)
 		ubifs_msg("mounted read-only");
 	x = (long long)c->main_lebs * c->leb_size;
 	ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	}
 
 	dbg_gen("re-mounted read-write");
-	c->vfs_sb->s_flags &= ~MS_RDONLY;
+	c->ro_mount = 0;
 	c->remounting_rw = 0;
 	c->always_chk_crc = 0;
 	err = dbg_check_space_info(c);
@@ -1676,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+	ubifs_assert(!c->ro_mount);
 
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
@@ -1704,6 +1704,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
+	c->ro_mount = 1;
 	err = dbg_check_space_info(c);
 	if (err)
 		ubifs_ro_mode(c, err);
@@ -1735,7 +1736,7 @@ static void ubifs_put_super(struct super_block *sb)
 	 * the mutex is locked.
 	 */
 	mutex_lock(&c->umount_mutex);
-	if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
+	if (!c->ro_mount) {
 		/*
 		 * First of all kill the background thread to make sure it does
 		 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1746,23 @@ static void ubifs_put_super(struct super_block *sb)
 			c->bgt = NULL;
 		}
 
-		/* Synchronize write-buffers */
-		if (c->jheads)
-			for (i = 0; i < c->jhead_cnt; i++)
-				ubifs_wbuf_sync(&c->jheads[i].wbuf);
-
 		/*
 		 * On fatal errors c->ro_error is set to 1, in which case we do
 		 * not write the master node.
 		 */
 		if (!c->ro_error) {
+			int err;
+
+			/* Synchronize write-buffers */
+			if (c->jheads)
+				for (i = 0; i < c->jhead_cnt; i++)
+					ubifs_wbuf_sync(&c->jheads[i].wbuf);
+
 			/*
 			 * We are being cleanly unmounted which means the
 			 * orphans were killed - indicate this in the master
 			 * node. Also save the reserved GC LEB number.
 			 */
-			int err;
-
 			c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
 			c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
 			c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1797,7 +1798,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
-	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+	if (c->ro_mount && !(*flags & MS_RDONLY)) {
 		if (c->ro_error) {
 			ubifs_msg("cannot re-mount R/W due to prior errors");
 			return -EROFS;
@@ -1809,7 +1810,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+	} else if (!c->ro_mount && (*flags & MS_RDONLY)) {
 		if (c->ro_error) {
 			ubifs_msg("cannot re-mount R/O due to prior errors");
 			return -EROFS;
@@ -2068,9 +2069,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	if (sb->s_root) {
+		struct ubifs_info *c1 = sb->s_fs_info;
+
 		/* A new mount point for already mounted UBIFS */
 		dbg_gen("this ubi volume is already mounted");
-		if ((flags ^ sb->s_flags) & MS_RDONLY) {
+		if (!!(flags & MS_RDONLY) != c1->ro_mount) {
 			err = -EBUSY;
 			goto out_deact;
 		}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f47ebb442d1c..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1032,6 +1032,7 @@ struct ubifs_debug_info;
  * @max_leb_cnt: maximum count of logical eraseblocks
  * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
+ * @ro_mount: the file-system was mounted as read-only
  * @ro_error: UBIFS switched to R/O mode because an error happened
  *
  * @dirty_pg_cnt: number of dirty pages (not used)
@@ -1173,11 +1174,14 @@ struct ubifs_debug_info;
  * @replay_sqnum: sequence number of node currently being replayed
  * @need_recovery: file-system needs recovery
  * @replaying: set to %1 during journal replay
- * @unclean_leb_list: LEBs to recover when mounting ro to rw
- * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
+ * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
+ *                    mode
+ * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
+ *                  FS to R/W mode
  * @size_tree: inode size information for recovery
- * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
- * @always_chk_crc: always check CRCs (while mounting and remounting rw)
+ * @remounting_rw: set while re-mounting from R/O mode to R/W mode
+ * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
+ *                  mode)
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg: debugging-related information
@@ -1274,6 +1278,7 @@ struct ubifs_info {
 	int max_leb_cnt;
 	int old_leb_cnt;
 	unsigned int ro_media:1;
+	unsigned int ro_mount:1;
 	unsigned int ro_error:1;
 
 	atomic_long_t dirty_pg_cnt;
-- 
cgit v1.2.3


From ff8f33c8b30d7b7efdcf2548c7f6e64db6a89b29 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 11 Aug 2010 09:37:53 +0100
Subject: GFS2: New truncate sequence

This updates GFS2's truncate code to use the new truncate
sequence correctly. This is a stepping stone to being
able to remove ip->i_disksize in favour of using i_size
everywhere now that the two sizes are always identical.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/gfs2/aops.c      |  10 +--
 fs/gfs2/bmap.c      | 247 +++++++++++++++++++++++++---------------------------
 fs/gfs2/bmap.h      |  20 +++--
 fs/gfs2/ops_inode.c |  26 +-----
 4 files changed, 135 insertions(+), 168 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..f687f25fb7ff 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -696,13 +696,11 @@ out:
 
 	page_cache_release(page);
 
-	/*
-	 * XXX(truncate): the call below should probably be replaced with
-	 * a call to the gfs2-specific truncate blocks helper to actually
-	 * release disk blocks..
-	 */
+	gfs2_trans_end(sdp);
 	if (pos + len > ip->i_inode.i_size)
-		truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
+		gfs2_trim_blocks(&ip->i_inode);
+	goto out_trans_fail;
+
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..20b971ad4973 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
  * @ip: the inode
  * @dibh: the dinode buffer
  * @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
  *
  * Returns: errno
  */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 /**
  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
  * @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if the @page is NULL
  *
  * This routine unstuffs a dinode and returns it to a "normal" state such
  * that the height can be grown in the traditional way.
@@ -884,84 +883,15 @@ out:
 	return error;
 }
 
-/**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al;
-	struct buffer_head *dibh;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_lock_check(ip);
-	if (error)
-		goto out;
-
-	al->al_requested = sdp->sd_max_height + RES_DATA;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(sdp,
-			sdp->sd_max_height + al->al_rgd->rd_length +
-			RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_end_trans;
-
-	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-		if (gfs2_is_stuffed(ip)) {
-			error = gfs2_unstuff_dinode(ip, NULL);
-			if (error)
-				goto out_brelse;
-		}
-	}
-
-	ip->i_disksize = size;
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
-	brelse(dibh);
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-
 /**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
  */
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	loff_t from = inode->i_size;
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
 	return err;
 }
 
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
 	struct buffer_head *dibh;
 	int journaled = gfs2_is_jdata(ip);
 	int error;
@@ -1039,31 +971,27 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 	if (error)
 		goto out;
 
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
 	if (gfs2_is_stuffed(ip)) {
-		u64 dsize = size + sizeof(struct gfs2_dinode);
-		ip->i_disksize = size;
-		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		if (dsize > dibh->b_size)
-			dsize = dibh->b_size;
-		gfs2_buffer_clear_tail(dibh, dsize);
-		error = 1;
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
 	} else {
-		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
-			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
-		if (!error) {
-			ip->i_disksize = size;
-			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-			ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			gfs2_dinode_out(ip, dibh->b_data);
+		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+			error = gfs2_block_truncate_page(mapping, newsize);
+			if (error)
+				goto out_brelse;
 		}
+		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 	}
 
-	brelse(dibh);
+	i_size_write(inode, newsize);
+	ip->i_disksize = newsize;
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
 
+	truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+	brelse(dibh);
 out:
 	gfs2_trans_end(sdp);
 	return error;
@@ -1143,86 +1071,149 @@ out:
 
 /**
  * do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
  *
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @size must
+ * be equal to or smaller than the current inode size.
  *
  * Returns: errno
  */
 
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	int error;
 
-	error = trunc_start(ip, size);
+	error = trunc_start(inode, oldsize, newsize);
 	if (error < 0)
 		return error;
-	if (error > 0)
+	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	error = trunc_dealloc(ip, size);
-	if (!error)
+	error = trunc_dealloc(ip, newsize);
+	if (error == 0)
 		error = trunc_end(ip);
 
 	return error;
 }
 
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 size = inode->i_size;
+	int ret;
+
+	ret = do_shrink(inode, size, size);
+	WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occuring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
+	struct gfs2_alloc *al = NULL;
 	int error;
 
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (gfs2_is_stuffed(ip) &&
+	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+		al = gfs2_alloc_get(ip);
+		if (al == NULL)
+			return -ENOMEM;
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto do_grow_alloc_put;
+
+		al->al_requested = 1;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto do_grow_qunlock;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
 	if (error)
-		return error;
+		goto do_grow_release;
 
-	down_write(&ip->i_rw_mutex);
+	if (al) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			goto do_end_trans;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
-		goto do_touch_out;
+		goto do_end_trans;
 
+	i_size_write(inode, size);
+	ip->i_disksize = size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
-do_touch_out:
-	up_write(&ip->i_rw_mutex);
+do_end_trans:
 	gfs2_trans_end(sdp);
+do_grow_release:
+	if (al) {
+		gfs2_inplace_release(ip);
+do_grow_qunlock:
+		gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+		gfs2_alloc_put(ip);
+	}
 	return error;
 }
 
 /**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
  *
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
  *
  * Returns: errno
  */
 
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
 {
-	int error;
+	int ret;
+	u64 oldsize;
 
-	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
-		return -EINVAL;
+	BUG_ON(!S_ISREG(inode->i_mode));
 
-	if (size > ip->i_disksize)
-		error = do_grow(ip, size);
-	else if (size < ip->i_disksize)
-		error = do_shrink(ip, size);
-	else
-		/* update time stamps */
-		error = do_touch(ip, size);
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
 
-	return error;
+	oldsize = inode->i_size;
+	if (newsize >= oldsize)
+		return do_grow(inode, newsize);
+
+	return do_shrink(inode, oldsize, newsize);
 }
 
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
 	}
 }
 
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-			      unsigned int len);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+			  struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+			   u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+				     unsigned int len);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..1d3f2fb466bd 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,30 +1071,6 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
-
-	if (attr->ia_size != ip->i_disksize) {
-		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-		if (error)
-			return error;
-		truncate_setsize(inode, attr->ia_size);
-		gfs2_trans_end(sdp);
-	}
-
-	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_disksize))
-		i_size_write(inode, ip->i_disksize);
-
-	return error;
-}
-
 static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1171,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		error = setattr_size(inode, attr);
+		error = gfs2_setattr_size(inode, attr->ia_size);
 	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
 		error = setattr_chown(inode, attr);
 	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
-- 
cgit v1.2.3


From a2e0f79939e09e74698564b88dee709db208e1e2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 11 Aug 2010 09:53:11 +0100
Subject: GFS2: Remove i_disksize

With the update of the truncate code, ip->i_disksize and
inode->i_size are merely copies of each other. This means
we can remove ip->i_disksize and use inode->i_size exclusively
reducing the size of a GFS2 inode by 8 bytes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c       |  6 +-----
 fs/gfs2/bmap.c       | 12 +++++-------
 fs/gfs2/dir.c        | 28 +++++++++++++++-------------
 fs/gfs2/file.c       |  2 +-
 fs/gfs2/glops.c      |  5 ++---
 fs/gfs2/incore.h     |  1 -
 fs/gfs2/inode.c      |  9 ++++-----
 fs/gfs2/inode.h      | 13 +++++++++++++
 fs/gfs2/ops_fstype.c |  2 +-
 fs/gfs2/ops_inode.c  | 10 +++++-----
 fs/gfs2/quota.c      | 13 +++++--------
 fs/gfs2/rgrp.c       |  6 +++---
 fs/gfs2/super.c      | 11 +++++------
 13 files changed, 60 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f687f25fb7ff..c92f36ba3fc9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -800,10 +800,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	page_cache_release(page);
 
 	if (copied) {
-		if (inode->i_size < to) {
+		if (inode->i_size < to)
 			i_size_write(inode, to);
-			ip->i_disksize = inode->i_size;
-		}
 		gfs2_dinode_out(ip, di);
 		mark_inode_dirty(inode);
 	}
@@ -874,8 +872,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 	if (ret > 0) {
-		if (inode->i_size > ip->i_disksize)
-			ip->i_disksize = inode->i_size;
 		gfs2_dinode_out(ip, dibh->b_data);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 20b971ad4973..04513e997df6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -131,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
@@ -160,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -985,7 +985,6 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	}
 
 	i_size_write(inode, newsize);
-	ip->i_disksize = newsize;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -1051,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
 	if (error)
 		goto out;
 
-	if (!ip->i_disksize) {
+	if (!i_size_read(&ip->i_inode)) {
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1167,7 +1166,6 @@ static int do_grow(struct inode *inode, u64 size)
 		goto do_end_trans;
 
 	i_size_write(inode, size);
-	ip->i_disksize = size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1219,7 +1217,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, ip->i_disksize);
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1260,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 
 	shift = sdp->sd_sb.sb_bsize_shift;
 	BUG_ON(gfs2_is_dir(ip));
-	end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
 	lblock = offset >> shift;
 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
 	if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..c1042ae438cc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -127,8 +127,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-	if (ip->i_disksize < offset + size)
-		ip->i_disksize = offset + size;
+	if (ip->i_inode.i_size < offset + size)
+		i_size_write(&ip->i_inode, offset + size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -225,8 +225,8 @@ out:
 	if (error)
 		return error;
 
-	if (ip->i_disksize < offset + copied)
-		ip->i_disksize = offset + copied;
+	if (ip->i_inode.i_size < offset + copied)
+		i_size_write(&ip->i_inode, offset + copied);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +275,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	unsigned int o;
 	int copied = 0;
 	int error = 0;
+	u64 disksize = i_size_read(&ip->i_inode);
 
-	if (offset >= ip->i_disksize)
+	if (offset >= disksize)
 		return 0;
 
-	if (offset + size > ip->i_disksize)
-		size = ip->i_disksize - offset;
+	if (offset + size > disksize)
+		size = disksize - offset;
 
 	if (!size)
 		return 0;
@@ -727,7 +728,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
-		if (hsize * sizeof(u64) != ip->i_disksize) {
+		if (hsize * sizeof(u64) != i_size_read(inode)) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
 		}
@@ -879,7 +880,7 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
 
-	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+	i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
 	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
@@ -1057,11 +1058,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	u64 *buf;
 	u64 *from, *to;
 	u64 block;
+	u64 disksize = i_size_read(&dip->i_inode);
 	int x;
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1072,7 +1074,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	if (!buf)
 		return -ENOMEM;
 
-	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+	for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
 					    block * sdp->sd_hash_bsize,
 					    sdp->sd_hash_bsize, 1);
@@ -1370,7 +1372,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1784,7 +1786,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..daadcd2e755f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -491,7 +491,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 			goto fail;
 
 		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_disksize > MAX_NON_LFS) {
+		    i_size_read(inode) > MAX_NON_LFS) {
 			error = -EOVERFLOW;
 			goto fail_gunlock;
 		}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..621d80e8fb2a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
-		  (unsigned long long)ip->i_inode.i_size,
-		  (unsigned long long)ip->i_disksize);
+		  (unsigned long long)i_size_read(&ip->i_inode));
 	return 0;
 }
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..c11971775275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -267,7 +267,6 @@ struct gfs2_inode {
 	u64 i_no_formal_ino;
 	u64 i_generation;
 	u64 i_eattr;
-	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	ip->i_disksize = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, ip->i_disksize);
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(ip->i_disksize);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  i_disksize = %llu\n",
-	       (unsigned long long)ip->i_disksize);
+	printk(KERN_INFO "  i_size = %llu\n",
+	       (unsigned long long)i_size_read(&ip->i_inode));
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..15ff4df20aab 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -80,6 +80,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 	dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
 }
 
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+						u64 minsize, u64 maxsize)
+{
+	u64 size = i_size_read(inode);
+	if (size < minsize || size > maxsize)
+		goto err;
+	if (size & ((1 << inode->i_blkbits) - 1))
+		goto err;
+	return 0;
+err:
+	gfs2_consist_inode(GFS2_I(inode));
+	return -EIO;
+}
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..5b5c87dfbfee 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -586,7 +586,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1d3f2fb466bd..ee6ffd590418 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -406,7 +406,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_disksize = size;
 	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +460,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
 	ip->i_diskflags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
@@ -990,7 +989,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
-	unsigned int x;
+	unsigned int x, size;
 	char *buf;
 	int error;
 
@@ -1002,7 +1001,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		return NULL;
 	}
 
-	if (!ip->i_disksize) {
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
 		gfs2_consist_inode(ip);
 		buf = ERR_PTR(-EIO);
 		goto out;
@@ -1014,7 +1014,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto out;
 	}
 
-	x = ip->i_disksize + 1;
+	x = size + 1;
 	buf = kmalloc(x, GFP_NOFS);
 	if (!buf)
 		buf = ERR_PTR(-ENOMEM);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..9bc6dd9a5443 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
 		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
-	if (size > inode->i_size) {
-		ip->i_disksize = size;
+	if (size > inode->i_size)
 		i_size_write(inode, size);
-	}
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+	u64 size = i_size_read(sdp->sd_qc_inode);
+	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
 		return -EIO;
-	}
+
 	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
 	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..370c29b536ea 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+		if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -588,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_disksize;
+	u64 rgrp_count = i_size_read(inode);
 	int error;
 
 	do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -628,7 +628,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_disksize)
+		    i_size_read(inode))
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..e031fa4965a3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -342,15 +342,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
-- 
cgit v1.2.3


From 820969f353587281d645735c83c7f07d606e67ba Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 11 Aug 2010 09:53:47 +0100
Subject: GFS2: No longer experimental

I think the time has arrvied to remove the experimental tag
from GFS2.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LBDAF)
+	depends on (64BIT || LBDAF)
 	select DLM if GFS2_FS_LOCKING_DLM
 	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
 	select SYSFS if GFS2_FS_LOCKING_DLM
-- 
cgit v1.2.3


From 9a3f236d40a99ea8dca3df40d8ef67631057cad6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 23 Aug 2010 11:49:34 +0100
Subject: GFS2: Add a bug trap in allocation code

This adds a check to ensure that if we reach the block allocator
that we don't try and proceed if there is no alloc structure
hanging off the inode. This should only happen if there is a bug
in GFS2. The error return code is distinctive in order that it
will be easily spotted.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 370c29b536ea..66b6d4d8b1d2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1496,11 +1496,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
-- 
cgit v1.2.3


From 3921120e757f9167f3fcd3a1781239824471b14d Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 20 Aug 2010 00:21:02 -0500
Subject: GFS2: fallocate support

This patch adds support for fallocate to gfs2.  Since the gfs2 does not support
uninitialized data blocks, it must write out zeros to all the blocks.  However,
since it does not need to lock any pages to read from, gfs2 can write out the
zero blocks much more efficiently.  On a moderately full filesystem, fallocate
works around 5 times faster on average.  The fallocate call also allows gfs2 to
add blocks to the file without changing the filesize, which will make it
possible for gfs2 to preallocate space for the rindex file, so that gfs2 can
grow a completely full filesystem.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c      |   4 +-
 fs/gfs2/incore.h    |   1 +
 fs/gfs2/inode.h     |   2 +
 fs/gfs2/ops_inode.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/rgrp.c      |  12 +++
 fs/gfs2/trans.h     |   1 +
 6 files changed, 272 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c92f36ba3fc9..180ef8a6de6b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
 #include "glops.h"
 
 
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+			    unsigned int from, unsigned int to)
 {
 	struct buffer_head *head = page_buffers(page);
 	unsigned int bsize = head->b_size;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c11971775275..578234bb03f8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -571,6 +571,7 @@ struct gfs2_sbd {
 	struct list_head sd_rindex_mru_list;
 	struct gfs2_rgrpd *sd_rindex_forward;
 	unsigned int sd_rgrps;
+	unsigned int sd_max_rg_data;
 
 	/* Journal index stuff */
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 15ff4df20aab..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to);
 extern void gfs2_set_aops(struct inode *inode);
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index ee6ffd590418..f6da0d7676e2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -1277,6 +1279,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return ret;
 }
 
+static void empty_write_end(struct page *page, unsigned from,
+			   unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	block_commit_write(page, from, to);
+}
+
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	unsigned start, end, next;
+	struct buffer_head *bh, *head;
+	int error;
+
+	if (!page_has_buffers(page)) {
+		error = block_prepare_write(page, from, to, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+
+		empty_write_end(page, from, to);
+		return 0;
+	}
+
+	bh = head = page_buffers(page);
+	next = end = 0;
+	while (next < from) {
+		next += bh->b_size;
+		bh = bh->b_this_page;
+	}
+	start = next;
+	do {
+		next += bh->b_size;
+		if (buffer_mapped(bh)) {
+			if (end) {
+				error = block_prepare_write(page, start, end,
+							    gfs2_block_map);
+				if (unlikely(error))
+					return error;
+				empty_write_end(page, start, end);
+				end = 0;
+			}
+			start = next;
+		}
+		else
+			end = next;
+		bh = bh->b_this_page;
+	} while (next < to);
+
+	if (end) {
+		error = block_prepare_write(page, start, end, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+		empty_write_end(page, start, end);
+	}
+
+	return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		goto out;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+	brelse(dibh);
+
+out:
+	return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+ 	   so it might end up with fewer data blocks */
+	if (max_data <= *data_blocks)
+		return;
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+			   loff_t len)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes;
+	struct gfs2_alloc *al;
+	int error;
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+		 sdp->sd_sb.sb_bsize_shift;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	if (!gfs2_write_alloc_required(ip, offset, len))
+		goto out_unlock;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+retry:
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error) {
+			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+				bytes >>= 1;
+				goto retry;
+			}
+			goto out_qunlock;
+		}
+		max_bytes = bytes;
+		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		al->al_requested = data_blocks + ind_blocks;
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + ip->i_alloc->al_rgd->rd_length;
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	goto out_unlock;
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
@@ -1327,6 +1580,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fallocate = gfs2_fallocate,
 	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 66b6d4d8b1d2..f9ddcf401753 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -589,6 +589,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
 	u64 rgrp_count = i_size_read(inode);
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 		}
 	}
 
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
@@ -622,6 +628,8 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	file_ra_state_init(&ra_state, inode->i_mapping);
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 			return error;
 		}
 	}
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..b849eb7ad37d 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,6 +20,7 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
-- 
cgit v1.2.3


From fe08d5a89726675a920b0e9bbbe849c46b27a6e5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 23 Aug 2010 11:54:45 +0100
Subject: GFS2: Fix whitespace in previous patch

Removes the offending space

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f6da0d7676e2..ce4f1dfb533f 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1423,7 +1423,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
 		max_data -= tmp;
 	}
 	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
- 	   so it might end up with fewer data blocks */
+	   so it might end up with fewer data blocks */
 	if (max_data <= *data_blocks)
 		return;
 	*data_blocks = max_data;
-- 
cgit v1.2.3


From 7b5e3d5fcf0d6fce66050bd0313a7dc2ae4abc62 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 3 Sep 2010 09:39:20 +0100
Subject: GFS2: Don't enforce min hold time when two demotes occur in rapid
 succession

Due to the design of the VFS, it is quite usual for operations on GFS2
to consist of a lookup (requiring a shared lock) followed by an
operation requiring an exclusive lock. If a remote node has cached an
exclusive lock, then it will receive two demote events in rapid succession
firstly for a shared lock and then to unlocked. The existing min hold time
code was triggering in this case, even if the node was otherwise idle
since the state change time was being updated by the initial demote.

This patch introduces logic to skip the min hold timer in the case that
a "double demote" of this kind has occurred. The min hold timer will
still be used in all other cases.

A new glock flag is introduced which is used to keep track of whether
there have been any newly queued holders since the last glock state
change. The min hold time is only applied if the flag is set.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Tested-by: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/glock.c      | 15 +++++++++++----
 fs/gfs2/incore.h     |  1 +
 fs/gfs2/trace_gfs2.h |  3 ++-
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..8e478e27f1f7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 		else
 			gfs2_glock_put_nolock(gl);
 	}
+	if (held1 && held2 && list_empty(&gl->gl_holders))
+		clear_bit(GLF_QUEUED, &gl->gl_flags);
 
 	gl->gl_state = new_state;
 	gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
 		if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
 			insert_pt = &gh2->gh_list;
 	}
+	set_bit(GLF_QUEUED, &gl->gl_flags);
 	if (likely(insert_pt == NULL)) {
 		list_add_tail(&gh->gh_list, &gl->gl_holders);
 		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 
 	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
-	if (time_before(now, holdtime))
-		delay = holdtime - now;
-	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-		delay = gl->gl_ops->go_min_hold_time;
+	if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+		if (time_before(now, holdtime))
+			delay = holdtime - now;
+		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+			delay = gl->gl_ops->go_min_hold_time;
+	}
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, delay);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'I';
 	if (test_bit(GLF_FROZEN, gflags))
 		*p++ = 'F';
+	if (test_bit(GLF_QUEUED, gflags))
+		*p++ = 'q';
 	*p = 0;
 	return buf;
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 578234bb03f8..b12ca10481e7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
 	GLF_REPLY_PENDING		= 9,
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
+	GLF_QUEUED			= 12,
 };
 
 struct gfs2_glock {
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
-- 
cgit v1.2.3


From 1fea7c25a05d388c0cdbe02cbdaf3a2e70885581 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 8 Sep 2010 10:09:25 +0100
Subject: GFS2: Update handling of DLM return codes to match reality

GFS2's idea of which return codes it needs to handle was based
upon those listed in dlm.h. Those didn't cover all the possible
codes and listed some which never happen. This updates GFS2 to
handle all the codes which can actually be returned from the
DLM under various circumstances.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lock_dlm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
 		ret |= LM_OUT_CANCELED;
 		goto out;
 	case -EAGAIN: /* Try lock fails */
+	case -EDEADLK: /* Deadlock detected */
 		goto out;
-	case -EINVAL: /* Invalid */
-	case -ENOMEM: /* Out of memory */
+	case -ETIMEDOUT: /* Canceled due to timeout */
 		ret |= LM_OUT_ERROR;
 		goto out;
 	case 0: /* Success */
-- 
cgit v1.2.3


From 9fa0ea9f26f64fbfc3dfd51d1dc2c230b65ffb19 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 13 Sep 2010 16:23:00 +0100
Subject: GFS2: Use new workqueue scheme

The recovery workqueue can be freezable since
we want it to finish what it is doing if the system is to
be frozen (although why you'd want to freeze a cluster node
is beyond me since it will result in it being ejected from
the cluster). It does still make sense for single node
GFS2 filesystems though.

The glock workqueue will benefit from being able to run more
work items concurrently. A test running postmark shows
improved performance and multi-threaded workloads are likely
to benefit even more. It needs to be high priority because
the latency directly affects the latency of filesystem glock
operations.

The delete workqueue is similar to the recovery workqueue in
that it must not get blocked by memory allocations, and may
run for a long time.

Potentially other GFS2 threads might also be converted to
workqueues, but I'll leave that for a later patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 fs/gfs2/glock.c | 6 ++++--
 fs/gfs2/main.c  | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8e478e27f1f7..c3f2a5cc8efb 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1783,10 +1783,12 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	glock_workqueue = create_workqueue("glock_workqueue");
+	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+						WQ_FREEZEABLE, 0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
 		return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..1c8bbf29e602 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -140,7 +140,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_NON_REENTRANT | WQ_RESCUER, 0);
+					  WQ_RESCUER | WQ_FREEZEABLE, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
-- 
cgit v1.2.3


From 8d1235852b462cfb66aa036bd4a2686763c69ed4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 17 Sep 2010 12:30:23 +0100
Subject: GFS2: Make . and .. qstrs constant

Rather than calculating the qstrs for . and .. each time
we need them, its better to keep a constant version of
these and just refer to them when required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
---
 fs/gfs2/dir.c       |  3 +++
 fs/gfs2/dir.h       | 34 +++++++++++++++++++---------------
 fs/gfs2/export.c    |  9 +--------
 fs/gfs2/main.c      |  4 ++++
 fs/gfs2/ops_inode.c | 24 ++++++------------------
 5 files changed, 33 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c1042ae438cc..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
 
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
 typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
 			    u64 leaf_no, void *data);
 typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
 struct gfs2_inode;
 struct gfs2_inum;
 
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-		   const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-		 const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-		  filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-		   const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+				     const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+			  const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+			const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+			 filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+			  const struct gfs2_inode *nip, unsigned int new_type);
 
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
 
-int gfs2_diradd_alloc_required(struct inode *dir,
-			       const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
-			    struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+				      const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+				   struct buffer_head **bhp);
 
 static inline u32 gfs2_disk_hash(const char *data, int len)
 {
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
 	memcpy(dent + 1, name->name, name->len);
 }
 
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
 #endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
-	struct qstr dotdot;
 	struct dentry *dentry;
 
-	/*
-	 * XXX(hch): it would be a good idea to keep this around as a
-	 *	     static variable.
-	 */
-	gfs2_str2qstr(&dotdot, "..");
-
-	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 1c8bbf29e602..d7eb1e209aa8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
 #include "glock.h"
 #include "quota.h"
 #include "recovery.h"
+#include "dir.h"
 
 static struct shrinker qd_shrinker = {
 	.shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
 {
 	int error;
 
+	gfs2_str2qstr(&gfs2_qdot, ".");
+	gfs2_str2qstr(&gfs2_qdotdot, "..");
+
 	error = gfs2_sys_init();
 	if (error)
 		return error;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index ce4f1dfb533f..98a94cfc2bb2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -471,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!gfs2_assert_withdraw(sdp, !error)) {
 		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
 		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-		struct qstr str;
 
-		gfs2_str2qstr(&str, ".");
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+		gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
 		dent->de_inum = di->di_num; /* already GFS2 endian */
 		dent->de_type = cpu_to_be16(DT_DIR);
 		di->di_entries = cpu_to_be32(1);
 
-		gfs2_str2qstr(&str, "..");
 		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+		gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
 
 		gfs2_inum_out(dip, dent);
 		dent->de_type = cpu_to_be16(DT_DIR);
@@ -523,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip)
 {
-	struct qstr dotname;
 	int error;
 
 	if (ip->i_entries != 2) {
@@ -540,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdot);
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdotdot);
 	if (error)
 		return error;
 
@@ -695,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 	struct inode *dir = &to->i_inode;
 	struct super_block *sb = dir->i_sb;
 	struct inode *tmp;
-	struct qstr dotdot;
 	int error = 0;
 
-	gfs2_str2qstr(&dotdot, "..");
-
 	igrab(dir);
 
 	for (;;) {
@@ -712,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 			break;
 		}
 
-		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
 		if (IS_ERR(tmp)) {
 			error = PTR_ERR(tmp);
 			break;
@@ -921,9 +912,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	}
 
 	if (dir_rename) {
-		struct qstr name;
-		gfs2_str2qstr(&name, "..");
-
 		error = gfs2_change_nlink(ndip, +1);
 		if (error)
 			goto out_end_trans;
@@ -931,7 +919,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_end_trans;
 
-		error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
 		if (error)
 			goto out_end_trans;
 	} else {
-- 
cgit v1.2.3


From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001
From: Corrado Zoccolo <czoccolo@gmail.com>
Date: Mon, 20 Sep 2010 15:24:50 +0200
Subject: cfq: improve fsync performance for small files

Fsync performance for small files achieved by cfq on high-end disks is
lower than what deadline can achieve, due to idling introduced between
the sync write happening in process context and the journal commit.

Moreover, when competing with a sequential reader, a process writing
small files and fsync-ing them is starved.

This patch fixes the two problems by:
- marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE
  flag set,
- force all queues that have REQ_NOIDLE requests to be put in the noidle
  tree.

Having the queue associated to the fsync-ing process and the one associated
 to journal commits in the noidle tree allows:
- switching between them without idling,
- fairness vs. competing idling queues, since they will be serviced only
  after the noidle tree expires its slice.

Acked-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 18 ++++--------------
 fs/jbd/commit.c     |  2 +-
 fs/jbd2/commit.c    |  2 +-
 3 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b9f86190763b..684592621736 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -216,7 +216,6 @@ struct cfq_data {
 	enum wl_type_t serving_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
-	bool noidle_tree_requires_idle;
 
 	/*
 	 * Each priority tree is sorted by next_request position.  These
@@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
 	cfq_log(cfqd, "workload slice:%d", slice);
 	cfqd->workload_expires = jiffies + slice;
-	cfqd->noidle_tree_requires_idle = false;
 }
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
 		cfq_mark_cfqq_deep(cfqq);
 
-	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+		enable_idle = 0;
+	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
@@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_slice_expired(cfqd, 1);
 		else if (sync && cfqq_empty &&
 			 !cfq_close_cooperator(cfqd, cfqq)) {
-			cfqd->noidle_tree_requires_idle |=
-				!(rq->cmd_flags & REQ_NOIDLE);
-			/*
-			 * Idling is enabled for SYNC_WORKLOAD.
-			 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
-			 * only if we processed at least one !REQ_NOIDLE request
-			 */
-			if (cfqd->serving_type == SYNC_WORKLOAD
-			    || cfqd->noidle_tree_requires_idle
-			    || cfqq->cfqg->nr_cfqq == 1)
-				cfq_arm_slice_timer(cfqd);
+			cfq_arm_slice_timer(cfqd);
 		}
 	}
 
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..3f030e9efea6 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -318,7 +318,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..80910f51d4b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -360,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
-- 
cgit v1.2.3


From 817f2c842d6c38acfd58d20d29ba583ec467ae35 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Mon, 20 Sep 2010 11:44:00 +0530
Subject: Fix various typos of valid in comments

Fix various typos of valid.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/char/pcmcia/cm4000_cs.c          | 2 +-
 drivers/media/video/zoran/zoran_driver.c | 2 +-
 drivers/s390/block/dasd_eckd.c           | 4 ++--
 fs/jfs/jfs_logmgr.c                      | 6 ++----
 fs/ocfs2/cluster/tcp_internal.h          | 2 +-
 include/linux/libata.h                   | 2 +-
 6 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index ec73d9f6d9ed..273be76be988 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1667,7 +1667,7 @@ static int cmm_open(struct inode *inode, struct file *filp)
 	/* opening will always block since the
 	 * monitor will be started by open, which
 	 * means we have to wait for ATR becoming
-	 * vaild = block until valid (or card
+	 * valid = block until valid (or card
 	 * inserted)
 	 */
 	if (filp->f_flags & O_NONBLOCK) {
diff --git a/drivers/media/video/zoran/zoran_driver.c b/drivers/media/video/zoran/zoran_driver.c
index 6f89d0a096ea..3c471a4e3e4a 100644
--- a/drivers/media/video/zoran/zoran_driver.c
+++ b/drivers/media/video/zoran/zoran_driver.c
@@ -1177,7 +1177,7 @@ static int setup_window(struct zoran_fh *fh, int x, int y, int width, int height
 	if (height > BUZ_MAX_HEIGHT)
 		height = BUZ_MAX_HEIGHT;
 
-	/* Check for vaild parameters */
+	/* Check for invalid parameters */
 	if (width < BUZ_MIN_WIDTH || height < BUZ_MIN_HEIGHT ||
 	    width > BUZ_MAX_WIDTH || height > BUZ_MAX_HEIGHT) {
 		dprintk(1,
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 66360c24bd48..59b4ecfb967b 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -1190,7 +1190,7 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
 		goto out_err2;
 	}
 	/*
-	 * dasd_eckd_vaildate_server is done on the first device that
+	 * dasd_eckd_validate_server is done on the first device that
 	 * is found for an LCU. All later other devices have to wait
 	 * for it, so they will read the correct feature codes.
 	 */
@@ -1216,7 +1216,7 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
 				"Read device characteristic failed, rc=%d", rc);
 		goto out_err3;
 	}
-	/* find the vaild cylinder size */
+	/* find the valid cylinder size */
 	if (private->rdc_data.no_cyl == LV_COMPAT_CYL &&
 	    private->rdc_data.long_no_cyl)
 		private->real_cyl = private->rdc_data.long_no_cyl;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..e1b8493b9aaa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
 		 * option 2 - shutdown file systems
 		 *	      associated with log ?
 		 * option 3 - extend log ?
-		 */
-		/*
 		 * option 4 - second chance
 		 *
 		 * mark log wrapped, and continue.
 		 * when all active transactions are completed,
-		 * mark log vaild for recovery.
+		 * mark log valid for recovery.
 		 * if crashed during invalid state, log state
-		 * implies invald log, forcing fsck().
+		 * implies invalid log, forcing fsck().
 		 */
 		/* mark log state log wrap in log superblock */
 		/* log->state = LOGWRAP; */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..15fdbdf9eb4b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
 
 struct o2net_sock_container {
 	struct kref		sc_kref;
-	/* the next two are vaild for the life time of the sc */
+	/* the next two are valid for the life time of the sc */
 	struct socket		*sc_sock;
 	struct o2nm_node	*sc_node;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f010f18a0f86..9eee84b24a4c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -340,7 +340,7 @@ enum {
 	ATA_EHI_DID_HARDRESET	= (1 << 17), /* already soft-reset this port */
 	ATA_EHI_PRINTINFO	= (1 << 18), /* print configuration info */
 	ATA_EHI_SETMODE		= (1 << 19), /* configure transfer mode */
-	ATA_EHI_POST_SETMODE	= (1 << 20), /* revaildating after setmode */
+	ATA_EHI_POST_SETMODE	= (1 << 20), /* revalidating after setmode */
 
 	ATA_EHI_DID_RESET	= ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET,
 
-- 
cgit v1.2.3


From f7732d6573c4f29fc1ca5d384bbf82ddfa115030 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 21 Sep 2010 16:52:40 -0400
Subject: NFS: Fix a use-after-free case in nfs_async_rename()

The call to nfs_async_rename_release() after rpc_run_task() is incorrect.
The rpc_run_task() is always guaranteed to call the ->rpc_release() method.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/unlink.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 698b3e6367ff..47530aacebfd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -426,7 +426,6 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 		.rpc_client = NFS_CLIENT(old_dir),
 		.flags = RPC_TASK_ASYNC,
 	};
-	struct rpc_task *task;
 
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
@@ -435,7 +434,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 
 	data->cred = rpc_lookup_cred();
 	if (IS_ERR(data->cred)) {
-		task = (struct rpc_task *)data->cred;
+		struct rpc_task *task = ERR_CAST(data->cred);
 		kfree(data);
 		return task;
 	}
@@ -468,11 +467,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 
 	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		nfs_async_rename_release(data);
-
-	return task;
+	return rpc_run_task(&task_setup_data);
 }
 
 /**
-- 
cgit v1.2.3


From d688e11007419fd060ae74d8d952a5c4ece735aa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 21 Sep 2010 16:52:40 -0400
Subject: NFSv4.1: Fix the slotid initialisation in nfs_async_rename()

This fixes an Oopsable condition that was introduced by commit
d3d4152a5d59af9e13a73efa9e9c24383fbe307f (nfs: make sillyrename an async
operation)

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c46e45e9b33f..72aa7067b85d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2550,6 +2550,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
+	res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
 
@@ -2575,6 +2576,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
 	arg->bitmask = server->attr_bitmask;
 	res->server = server;
+	res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
-- 
cgit v1.2.3


From 609588928fae2c977c887d6d31b1f0aae60ea09e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 21 Sep 2010 16:55:31 -0400
Subject: NFS: Convert nfsiod to use alloc_workqueue()

create_singlethread_workqueue() is deprecated.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2f9266406afc..2ff814272dcf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1493,7 +1493,7 @@ static int nfsiod_start(void)
 {
 	struct workqueue_struct *wq;
 	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = create_singlethread_workqueue("nfsiod");
+	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
 	if (wq == NULL)
 		return -ENOMEM;
 	nfsiod_workqueue = wq;
-- 
cgit v1.2.3


From d141d97437a3c84aa18cfd5c8d91b89c4173f25c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Sep 2010 16:55:47 -0400
Subject: NFS: Fix NFSv3 debugging messages in fs/nfs/nfs3proc.c

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4e9d941ab548..f8446b38dcb4 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -671,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 
 	nfs_free_fattr(res.dir_attr);
 out:
-	dprintk("NFS reply readdir: %d\n", status);
+	dprintk("NFS reply readdir%s: %d\n",
+			plus? "plus" : "", status);
 	return status;
 }
 
@@ -741,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 	dprintk("NFS call  fsstat\n");
 	nfs_fattr_init(stat->fattr);
 	status = rpc_call_sync(server->client, &msg, 0);
-	dprintk("NFS reply statfs: %d\n", status);
+	dprintk("NFS reply fsstat: %d\n", status);
 	return status;
 }
 
-- 
cgit v1.2.3


From b4687da7fc5f741af7fee9b0248a2cf2ad9c4478 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Sep 2010 16:55:48 -0400
Subject: SUNRPC: Refactor logic to NUL-terminate strings in pages

Clean up: Introduce a helper to '\0'-terminate XDR strings
that are placed in a page in the page cache.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c           |  6 +-----
 fs/nfs/nfs3xdr.c           |  6 +-----
 fs/nfs/nfs4xdr.c           |  5 +----
 include/linux/sunrpc/xdr.h |  1 +
 net/sunrpc/xdr.c           | 17 +++++++++++++++++
 5 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e15bc0306d0c..79c74387a2fe 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -596,7 +596,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 len, recvd;
-	char	*kaddr;
 	int	status;
 
 	if ((status = ntohl(*p++)))
@@ -623,10 +622,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 		return -EIO;
 	}
 
-	/* NULL terminate the string we got */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 89c40f8ec6e6..52b2fda66e63 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -824,7 +824,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 len, recvd;
-	char	*kaddr;
 	int	status;
 
 	status = ntohl(*p++);
@@ -857,10 +856,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 		return -EIO;
 	}
 
-	/* NULL terminate the string we got */
-	kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b0bd7efe8100..86ab69eb149c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4299,7 +4299,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	size_t hdrlen;
 	u32 len, recvd;
 	__be32 *p;
-	char *kaddr;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4329,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	 * and and null-terminate the text (the VFS expects
 	 * null-termination).
 	 */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 35cf2e8cd7c6..8c1dcbb54d89 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -108,6 +108,7 @@ void	xdr_encode_pages(struct xdr_buf *, struct page **, unsigned int,
 			 unsigned int);
 void	xdr_inline_pages(struct xdr_buf *, unsigned int,
 			 struct page **, unsigned int, unsigned int);
+void	xdr_terminate_string(struct xdr_buf *, const u32);
 
 static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len)
 {
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 3bbef7f5f826..e0725d9d8107 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -111,6 +111,23 @@ xdr_decode_string_inplace(__be32 *p, char **sp,
 }
 EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
 
+/**
+ * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
+ * @buf: XDR buffer where string resides
+ * @len: length of string, in bytes
+ *
+ */
+void
+xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+{
+	char *kaddr;
+
+	kaddr = kmap_atomic(buf->pages[0], KM_USER0);
+	kaddr[buf->page_base + len] = '\0';
+	kunmap_atomic(kaddr, KM_USER0);
+}
+EXPORT_SYMBOL(xdr_terminate_string);
+
 void
 xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
 		 unsigned int len)
-- 
cgit v1.2.3


From 8ff30fa4eff2ff9e207961c654caa093f0c84873 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Aug 2010 17:04:07 +1000
Subject: nfsd: disable deferral for NFSv4

Now that a slight delay in getting a reply to an upcall doesn't
require deferring of requests, request deferral for all NFSv4
requests - the concept doesn't really fit with the v4 model.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..0cdfd022bb7b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->cstate.session = NULL;
 	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
 	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
-	/* Use the deferral mechanism only for NFSv4.0 compounds */
-	rqstp->rq_usedeferral = (args->minorversion == 0);
+	/*
+	 * Don't use the deferral mechanism for NFSv4; compounds make it
+	 * too hard to avoid non-idempotency problems.
+	 */
+	rqstp->rq_usedeferral = 0;
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
-- 
cgit v1.2.3


From 839049a8732d689d02051e0198fb60a22f7ccb4b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Aug 2010 17:04:06 +1000
Subject: nfsd/idmap: drop special request deferal in favour of improved
 default.

The idmap code manages request deferal by waiting for a reply from
userspace rather than putting the NFS request on a queue to be retried
from the start.
Now that the common deferal code does this there is no need for the
special code in idmap.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4idmap.c | 105 ++++++----------------------------------------------
 1 file changed, 11 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 808b33a4a090..f0695e815f0e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
 	cache_unregister(&nametoid_cache);
 }
 
-/*
- * Deferred request handling
- */
-
-struct idmap_defer_req {
-       struct cache_req		req;
-       struct cache_deferred_req deferred_req;
-       wait_queue_head_t	waitq;
-       atomic_t			count;
-};
-
-static inline void
-put_mdr(struct idmap_defer_req *mdr)
-{
-	if (atomic_dec_and_test(&mdr->count))
-		kfree(mdr);
-}
-
-static inline void
-get_mdr(struct idmap_defer_req *mdr)
-{
-	atomic_inc(&mdr->count);
-}
-
-static void
-idmap_revisit(struct cache_deferred_req *dreq, int toomany)
-{
-	struct idmap_defer_req *mdr =
-		container_of(dreq, struct idmap_defer_req, deferred_req);
-
-	wake_up(&mdr->waitq);
-	put_mdr(mdr);
-}
-
-static struct cache_deferred_req *
-idmap_defer(struct cache_req *req)
-{
-	struct idmap_defer_req *mdr =
-		container_of(req, struct idmap_defer_req, req);
-
-	mdr->deferred_req.revisit = idmap_revisit;
-	get_mdr(mdr);
-	return (&mdr->deferred_req);
-}
-
-static inline int
-do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
-		struct cache_detail *detail, struct ent **item,
-		struct idmap_defer_req *mdr)
-{
-	*item = lookup_fn(key);
-	if (!*item)
-		return -ENOMEM;
-	return cache_check(detail, &(*item)->h, &mdr->req);
-}
-
-static inline int
-do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
-			struct ent *key, struct cache_detail *detail,
-			struct ent **item)
-{
-	int ret = -ENOMEM;
-
-	*item = lookup_fn(key);
-	if (!*item)
-		goto out_err;
-	ret = -ETIMEDOUT;
-	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-			|| (*item)->h.expiry_time < seconds_since_boot()
-			|| detail->flush_time > (*item)->h.last_refresh)
-		goto out_put;
-	ret = -ENOENT;
-	if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
-		goto out_put;
-	return 0;
-out_put:
-	cache_put(&(*item)->h, detail);
-out_err:
-	*item = NULL;
-	return ret;
-}
-
 static int
 idmap_lookup(struct svc_rqst *rqstp,
 		struct ent *(*lookup_fn)(struct ent *), struct ent *key,
 		struct cache_detail *detail, struct ent **item)
 {
-	struct idmap_defer_req *mdr;
 	int ret;
 
-	mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
-	if (!mdr)
+	*item = lookup_fn(key);
+	if (!*item)
 		return -ENOMEM;
-	atomic_set(&mdr->count, 1);
-	init_waitqueue_head(&mdr->waitq);
-	mdr->req.defer = idmap_defer;
-	ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr);
-	if (ret == -EAGAIN) {
-		wait_event_interruptible_timeout(mdr->waitq,
-			test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ);
-		ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item);
+ retry:
+	ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
+
+	if (ret == -ETIMEDOUT) {
+		struct ent *prev_item = *item;
+		*item = lookup_fn(key);
+		if (*item != prev_item)
+			goto retry;
+		cache_put(&(*item)->h, detail);
 	}
-	put_mdr(mdr);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 63185942c5f138c62de16b4cbc7eee494a58fea8 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 22 Sep 2010 09:50:35 -0400
Subject: lockd: Remove BKL from the client

This patch removes all calls to lock_kernel() from the client.  This patch
should be applied after the "fs/lock.c prepare for BKL removal" patch submitted
by Arnd Bergmann on September 18.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c | 15 ++++++++++-----
 fs/lockd/clntproc.c | 13 ++++---------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..d5bb86866e6c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
 };
 
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 
 /**
  * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
 		block->b_lock = fl;
 		init_waitqueue_head(&block->b_wait);
 		block->b_status = nlm_lck_blocked;
+
+		spin_lock(&nlm_blocked_lock);
 		list_add(&block->b_list, &nlm_blocked);
+		spin_unlock(&nlm_blocked_lock);
 	}
 	return block;
 }
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
 {
 	if (block == NULL)
 		return;
+	spin_lock(&nlm_blocked_lock);
 	list_del(&block->b_list);
+	spin_unlock(&nlm_blocked_lock);
 	kfree(block);
 }
 
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 	 * Look up blocked request based on arguments. 
 	 * Warning: must not use cookie to match it!
 	 */
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		struct file_lock *fl_blocked = block->b_lock;
 
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		wake_up(&block->b_wait);
 		res = nlm_granted;
 	}
+	spin_unlock(&nlm_blocked_lock);
 	return res;
 }
 
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
 	allow_signal(SIGKILL);
 
 	down_write(&host->h_rwsem);
-
-	/* This one ensures that our parent doesn't terminate while the
-	 * reclaim is in progress */
-	lock_kernel();
 	lockd_up();	/* note: this cannot fail as lockd is already running */
 
 	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
 	dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (block->b_host == host) {
 			block->b_status = nlm_lck_denied_grace_period;
 			wake_up(&block->b_wait);
 		}
 	}
+	spin_unlock(&nlm_blocked_lock);
 
 	/* Release host handle after use */
 	nlm_release_host(host);
 	lockd_down();
-	unlock_kernel();
 	return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..47ea1e1925b8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
-	lock_kernel();
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 		status = nlmclnt_test(call, fl);
 	else
 		status = -EINVAL;
-
 	fl->fl_ops->fl_release_private(fl);
 	fl->fl_ops = NULL;
-	unlock_kernel();
 
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
 
 static void nlmclnt_rpc_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
 
 static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
 	new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
 	list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 }
 
 static void nlmclnt_locks_release_private(struct file_lock *fl)
 {
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	list_del(&fl->fl_u.nfs_fl.list);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 die:
 	return;
  retry_rebind:
-	lock_kernel();
 	nlm_rebind_host(req->a_host);
-	unlock_kernel();
  retry_unlock:
 	rpc_restart_call(task);
 }
@@ -801,9 +798,7 @@ retry_cancel:
 	/* Don't ever retry more than 3 times */
 	if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
 		goto die;
-	lock_kernel();
 	nlm_rebind_host(req->a_host);
-	unlock_kernel();
 	rpc_restart_call(task);
 	rpc_delay(task, 30 * HZ);
 }
-- 
cgit v1.2.3


From f904be9cc77f361d37d71468b13ff3d1a1823dea Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Tue, 21 Sep 2010 16:38:12 -0400
Subject: lockd: Mostly remove BKL from the server

This patch removes all but one call to lock_kernel() from the server.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc4proc.c |  2 --
 fs/lockd/svclock.c  | 31 +++++++++++++++++++++----------
 fs/lockd/svcproc.c  |  2 --
 3 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..a336e832475d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlm4svc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..6f1ef000975a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
  * The list of blocked locks to retry
  */
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 
 /*
  * Insert a blocked lock into the global list
  */
 static void
-nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
 {
 	struct nlm_block *b;
 	struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
 	block->b_when = when;
 }
 
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+	spin_lock(&nlm_blocked_lock);
+	nlmsvc_insert_block_locked(block, when);
+	spin_unlock(&nlm_blocked_lock);
+}
+
 /*
  * Remove a block from the global list
  */
@@ -94,7 +102,9 @@ static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
 	if (!list_empty(&block->b_list)) {
+		spin_lock(&nlm_blocked_lock);
 		list_del_init(&block->b_list);
+		spin_unlock(&nlm_blocked_lock);
 		nlmsvc_release_block(block);
 	}
 }
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 	struct nlm_block *block;
 	int rc = -ENOENT;
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
 			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 			} else if (result == 0)
 				block->b_granted = 1;
 
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
 			svc_wake_up(block->b_daemon);
 			rc = 0;
 			break;
 		}
 	}
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 	if (rc == -ENOENT)
 		printk(KERN_WARNING "lockd: grant for unknown block\n");
 	return rc;
@@ -803,7 +813,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 
 	dprintk("lockd: GRANT_MSG RPC callback\n");
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	/* if the block is not on a list at this point then it has
 	 * been invalidated. Don't try to requeue it.
 	 *
@@ -825,19 +835,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 		/* Call was successful, now wait for client callback */
 		timeout = 60 * HZ;
 	}
-	nlmsvc_insert_block(block, timeout);
+	nlmsvc_insert_block_locked(block, timeout);
 	svc_wake_up(block->b_daemon);
 out:
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 }
 
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex.  This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
 static void nlmsvc_grant_release(void *data)
 {
 	struct nlm_rqst		*call = data;
-
-	lock_kernel();
 	nlmsvc_release_block(call->a_block);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..c3069f38d602 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlmsvc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_callback_ops = {
-- 
cgit v1.2.3


From c67874f942e30039442d925b03793e0a46ddcddd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 22 Sep 2010 12:55:07 +1000
Subject: nfsd: formally deprecate legacy nfsd syscall interface

The syscall interface is has been replaced by a more flexible
interface since 2.6.0.  It is time to work towards discarding
the old interface.

So add a entry in feature-removal-schedule.txt and print a warning
when the interface is used.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 Documentation/feature-removal-schedule.txt | 10 ++++++++++
 fs/nfsd/nfsctl.c                           | 10 ++++++++++
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 842aa9de84a6..076a2c02adaf 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -564,3 +564,13 @@ Who:	FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
 
 ----------------------------
 
+What:   access to nfsd auth cache through sys_nfsservctl or '.' files
+        in the 'nfsd' filesystem.
+When:   2.6.40
+Why:    This is a legacy interface which have been replaced by a more
+        dynamic cache.  Continuing to maintain this interface is an
+        unnecessary burden.
+Who:    NeilBrown <neilb@suse.de>
+
+----------------------------
+	
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..7f0fc8861b85 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -121,6 +121,16 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+	static int warned;
+	if (file->f_dentry->d_name.name[0] == '.' && !warned) {
+		char name[sizeof(current->comm)];
+		printk(KERN_INFO
+		       "Warning: \"%s\" uses deprecated NFSD interface: %s."
+		       "  This will be removed in 2.6.40\n",
+		       get_task_comm(name, current),
+		       file->f_dentry->d_name.name);
+		warned = 1;
+	}
 	if (! file->private_data) {
 		/* An attempt to read a transaction file without writing
 		 * causes a 0-byte write so that the file can return
-- 
cgit v1.2.3


From 1e1405673e4e40a94ed7620553eb440a21040402 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 22 Sep 2010 12:55:07 +1000
Subject: nfsd: allow deprecated interface to be compiled out.

Add CONFIG_NFSD_DEPRECATED, default to y.
Only include deprecated interface if this is defined.
This allows distros to remove this interface before the official
removal, and allows developers to test without it.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/Makefile      |  5 +----
 fs/compat.c      |  2 +-
 fs/nfsd/Kconfig  | 12 ++++++++++++
 fs/nfsd/export.c | 22 +++++++++++++++++++---
 fs/nfsd/nfsctl.c | 10 ++++++++++
 5 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..26956fcec917 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
-
-nfsd-$(CONFIG_NFSD)		:= nfsctl.o
-obj-y				+= $(nfsd-y) $(nfsd-m)
-
+obj-$(CONFIG_NFSD_DEPRECATED)	+= nfsctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..df5e671f0015 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1963,7 +1963,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
 	u16			svc32_port;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
 
 	  If unsure, say N.
 
+config NFSD_DEPRECATED
+	bool "Include support for deprecated syscall interface to NFSD"
+	depends on NFSD
+	default y
+	help
+	  The syscall interface to nfsd was obsoleted in 2.6.0 by a new
+	  filesystem based interface.  The old interface is due for removal
+	  in 2.6.40.  If you wish to remove the interface before then
+	  say N.
+
+	  In unsure, say Y.
+
 config NFSD_V2_ACL
 	bool
 	depends on NFSD
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index e56827b88fd2..a3c7d0ceb24f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
 typedef struct auth_domain	svc_client;
 typedef struct svc_export	svc_export;
 
-static void		exp_do_unexport(svc_export *unexp);
-static int		exp_verify_string(char *cp, int max);
-
 /*
  * We have two caches.
  * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
 	return ek;
 }
 
+#ifdef CONFIG_NFSD_DEPRECATED
 static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
 		       struct svc_export *exp)
 {
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
 
 	return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
+#endif
 
 static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
 				     struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
 	return exp;
 }
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
  * Hashtable locking. Write locks are placed only by user processes
  * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
 {
 	up_write(&hash_sem);
 }
+#else
+
+/* hash_sem not needed once deprecated interface is removed */
+void exp_readlock(void) {}
+static inline void exp_writelock(void){}
+void exp_readunlock(void) {}
+static inline void exp_writeunlock(void){}
+
+#endif
+
+#ifdef CONFIG_NFSD_DEPRECATED
+static void		exp_do_unexport(svc_export *unexp);
+static int		exp_verify_string(char *cp, int max);
 
 static void exp_fsid_unhash(struct svc_export *exp)
 {
@@ -1147,6 +1160,7 @@ out_unlock:
 	exp_writeunlock();
 	return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /*
  * Obtain the root fh on behalf of a client.
@@ -1529,6 +1543,7 @@ const struct seq_operations nfs_exports_op = {
 	.show	= e_show,
 };
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
  * Add or modify a client.
  * Change requests may involve the list of host addresses. The list of
@@ -1618,6 +1633,7 @@ exp_verify_string(char *cp, int max)
 	printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
 	return 0;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /*
  * Initialize the exports module.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7f0fc8861b85..b278e444e2f4 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
  */
 enum {
 	NFSD_Root = 1,
+#ifdef CONFIG_NFSD_DEPRECATED
 	NFSD_Svc,
 	NFSD_Add,
 	NFSD_Del,
@@ -29,6 +30,7 @@ enum {
 	NFSD_Unexport,
 	NFSD_Getfd,
 	NFSD_Getfs,
+#endif
 	NFSD_List,
 	NFSD_Export_features,
 	NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
 /*
  * write() for these nodes.
  */
+#ifdef CONFIG_NFSD_DEPRECATED
 static ssize_t write_svc(struct file *file, char *buf, size_t size);
 static ssize_t write_add(struct file *file, char *buf, size_t size);
 static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
 static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+#endif
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+#ifdef CONFIG_NFSD_DEPRECATED
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
 	[NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Unexport] = write_unexport,
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
+#endif
 	[NFSD_Fh] = write_filehandle,
 	[NFSD_FO_UnlockIP] = write_unlock_ip,
 	[NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -196,6 +202,7 @@ static const struct file_operations pool_stats_operations = {
  * payload - write methods
  */
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /**
  * write_svc - Start kernel's NFSD server
  *
@@ -491,6 +498,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  out:
 	return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /**
  * write_unlock_ip - Release all locks used by a client
@@ -1365,6 +1373,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
 	static struct tree_descr nfsd_files[] = {
+#ifdef CONFIG_NFSD_DEPRECATED
 		[NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
 		[NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
 		[NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1372,6 +1381,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
 		[NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
+#endif
 		[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
 		[NFSD_Export_features] = {"export_features",
 					&export_features_operations, S_IRUGO},
-- 
cgit v1.2.3


From 8209e2f46752914e94f65469b8312c42dc5d7d8f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 4 Sep 2010 18:52:49 -0700
Subject: fs/seq_file.c: Remove unnecessary casts of private_data

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/seq_file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..0e7cb1395a94 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
  */
 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	size_t copied = 0;
 	loff_t pos;
 	size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
  */
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
  */
 int seq_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	kfree(m->buf);
 	kfree(m);
 	return 0;
-- 
cgit v1.2.3


From 0c6d7d5da27df7ec586fc916f06136a6b7ad1f72 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 4 Sep 2010 18:52:48 -0700
Subject: fs/ecryptfs: Remove unnecessary casts of private_data

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ecryptfs/ecryptfs_kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..40186b959429 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -477,7 +477,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
-	return (struct ecryptfs_file_info *)file->private_data;
+	return file->private_data;
 }
 
 static inline void
-- 
cgit v1.2.3


From 72b43570f304634532b4d66c991eef529135f68c Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 15 Sep 2010 21:46:02 +0900
Subject: ext2: fix a typo on comment in ext2/inode.c

'excpet' should be 'except'.
'ext3_get_branch' should be 'ext2_get_branch'.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ext2/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..533699c16040 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
  *	the same format as ext2_get_branch() would do. We are calling it after
  *	we had read the existing part of chain and partial points to the last
  *	triple of that (one with zero ->key). Upon the exit we have the same
- *	picture as after the successful ext2_get_block(), excpet that in one
+ *	picture as after the successful ext2_get_block(), except that in one
  *	place chain is disconnected - *branch->p is still zero (we did not
  *	set the last link), but branch->key contains the number that should
  *	be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
 	mutex_lock(&ei->truncate_mutex);
 	/*
 	 * If the indirect block is missing while we are reading
-	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * the chain(ext2_get_branch() returns -EAGAIN err), or
 	 * if the chain has been changed after we grab the semaphore,
 	 * (either because another process truncated this branch, or
 	 * another get_block allocated this branch) re-grab the chain to see if
-- 
cgit v1.2.3


From f57a024ed2ee00d62515ffd4040f5ded3e0eb853 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Sep 2010 13:41:42 +0100
Subject: GFS2: Remove ignore_local_fs mount argument

This is been a no-op for a very long time now. I'm pretty sure
nobody uses it, but just in case we'll still accept it on the
command line, but ignore it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 1 -
 fs/gfs2/super.c  | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b12ca10481e7..c8a2db1bfe65 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -416,7 +416,6 @@ struct gfs2_args {
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
 	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
 	unsigned int ar_spectator:1;		/* Don't get a journal */
-	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
 	unsigned int ar_localcaching:1;		/* Local caching */
 	unsigned int ar_debug:1;		/* Oops on errors */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e031fa4965a3..06a4a7e8cf7c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -159,7 +159,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
@@ -1128,7 +1128,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
 	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
@@ -1233,8 +1232,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
 	if (args->ar_localcaching)
-- 
cgit v1.2.3


From 5eebde23223aeb0ad2d9e3be6590ff8bbfab0fc2 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Thu, 23 Sep 2010 08:55:58 -0400
Subject: nfs: introduce mount option '-olocal_lock' to make locks local

NFS clients since 2.6.12 support flock locks by emulating fcntl byte-range
locks. Due to this, some windows applications which seem to use both flock
(share mode lock mapped as flock by Samba) and fcntl locks sequentially on
the same file, can't lock as they falsely assume the file is already locked.
The problem was reported on a setup with windows clients accessing excel files
on a Samba exported share which is originally a NFS mount from a NetApp filer.

Older NFS clients (< 2.6.12) did not see this problem as flock locks were
considered local. To support legacy flock behavior, this patch adds a mount
option "-olocal_lock=" which can take the following values:

   'none'  		- Neither flock locks nor POSIX locks are local
   'flock' 		- flock locks are local
   'posix' 		- fcntl/POSIX locks are local
   'all'		- Both flock locks and POSIX locks are local

Testing:

   - This patch was tested by using -olocal_lock option with different values
     and the NLM calls were noted from the network packet captured.

     'none'  - NLM calls were seen during both flock() and fcntl(), flock lock
   	       was granted, fcntl was denied
     'flock' - no NLM calls for flock(), NLM call was seen for fcntl(),
   	       granted
     'posix' - NLM call was seen for flock() - granted, no NLM call for fcntl()
     'all'   - no NLM calls were seen during both flock() and fcntl()

   - No bugs were seen during NFSv4 locking/unlocking in general and NFSv4
     reboot recovery.

Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  6 +++--
 fs/nfs/file.c             | 45 +++++++++++++++++++++++++-----------
 fs/nfs/super.c            | 59 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_mount.h |  3 +++
 4 files changed, 97 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4e7df2adb212..5f01f42b3991 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -635,7 +635,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
  */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-	if (!(server->flags & NFS_MOUNT_NONLM))
+	if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
+			!(server->flags & NFS_MOUNT_LOCAL_FCNTL))
 		nlmclnt_done(server->nlm_host);
 }
 
@@ -657,7 +658,8 @@ static int nfs_start_lockd(struct nfs_server *server)
 
 	if (nlm_init.nfs_version > 3)
 		return 0;
-	if (server->flags & NFS_MOUNT_NONLM)
+	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+			(server->flags & NFS_MOUNT_LOCAL_FCNTL))
 		return 0;
 
 	switch (clp->cl_proto) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index eb51bd6201da..59cbe1ba0511 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -684,7 +684,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 	return ret;
 }
 
-static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
@@ -699,7 +700,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 	if (nfs_have_delegation(inode, FMODE_READ))
 		goto out_noconflict;
 
-	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+	if (is_local)
 		goto out_noconflict;
 
 	status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -730,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
 	return res;
 }
 
-static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status;
@@ -745,15 +747,19 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
 	 * 	If we're signalled while cleaning up locks on process exit, we
 	 * 	still need to complete the unlock.
 	 */
-	/* Use local locking if mounted with "-onolock" */
-	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
 	return status;
 }
 
-static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status;
@@ -766,8 +772,11 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
 	if (status != 0)
 		goto out;
 
-	/* Use local locking if mounted with "-onolock" */
-	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
@@ -791,6 +800,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int ret = -ENOLCK;
+	int is_local = 0;
 
 	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
 			filp->f_path.dentry->d_parent->d_name.name,
@@ -804,6 +814,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		goto out_err;
 
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+		is_local = 1;
+
 	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
 		ret = NFS_PROTO(inode)->lock_check_bounds(fl);
 		if (ret < 0)
@@ -811,11 +824,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	}
 
 	if (IS_GETLK(cmd))
-		ret = do_getlk(filp, cmd, fl);
+		ret = do_getlk(filp, cmd, fl, is_local);
 	else if (fl->fl_type == F_UNLCK)
-		ret = do_unlk(filp, cmd, fl);
+		ret = do_unlk(filp, cmd, fl, is_local);
 	else
-		ret = do_setlk(filp, cmd, fl);
+		ret = do_setlk(filp, cmd, fl, is_local);
 out_err:
 	return ret;
 }
@@ -825,6 +838,9 @@ out_err:
  */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
+	struct inode *inode = filp->f_mapping->host;
+	int is_local = 0;
+
 	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
 			filp->f_path.dentry->d_parent->d_name.name,
 			filp->f_path.dentry->d_name.name,
@@ -833,14 +849,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+		is_local = 1;
+
 	/* We're simulating flock() locks using posix locks on the server */
 	fl->fl_owner = (fl_owner_t)filp;
 	fl->fl_start = 0;
 	fl->fl_end = OFFSET_MAX;
 
 	if (fl->fl_type == F_UNLCK)
-		return do_unlk(filp, cmd, fl);
-	return do_setlk(filp, cmd, fl);
+		return do_unlk(filp, cmd, fl, is_local);
+	return do_setlk(filp, cmd, fl, is_local);
 }
 
 /*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ec3966e4706b..2e866d86c220 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
 	Opt_fscache_uniq,
+	Opt_local_lock,
 
 	/* Special mount options */
 	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
 
 	{ Opt_lookupcache, "lookupcache=%s" },
 	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -236,6 +238,22 @@ static match_table_t nfs_lookupcache_tokens = {
 	{ Opt_lookupcache_err, NULL }
 };
 
+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
@@ -1009,9 +1027,13 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_lock:
 			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+					NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_nolock:
 			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+				       NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_v2:
 			mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1412,6 +1434,34 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->fscache_uniq = string;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+					       NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS:	invalid	"
+						"local_lock argument\n");
+				return 0;
+			};
+			break;
 
 		/*
 		 * Special options
@@ -1817,6 +1867,12 @@ static int nfs_validate_mount_data(void *options,
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
 		/*
 		 * The legacy version 6 binary mount data from userspace has a
 		 * field used only to transport selinux information into the
@@ -2433,7 +2489,8 @@ static void nfs4_fill_super(struct super_block *sb)
 
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 
 static int nfs4_validate_text_mount_data(void *options,
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index 5d59ae861aa6..576bddd72e04 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -71,4 +71,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_NORESVPORT		0x40000
 #define NFS_MOUNT_LEGACY_INTERFACE	0x80000
 
+#define NFS_MOUNT_LOCAL_FLOCK	0x100000
+#define NFS_MOUNT_LOCAL_FCNTL	0x200000
+
 #endif
-- 
cgit v1.2.3


From c2048b003cfb840ad81bdc6eb55beb12a19a222e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 23 Sep 2010 14:00:31 +0100
Subject: GFS2: Remove localcaching mount option

This option defaulted to on for lock_nolock mounts and off
otherwise. The only function was to avoid the revalidation of
dentries. In the cluster case, that is entirely pointless and
liable to cause coherency problems.

The patch changes the revalidation to depend upon whether the
fs is a local or cluster fs (i.e. it follows the existing default
behaviour). I very much doubt anybody ever used this option as
there is no reason to. Even so we will continue to accept it
on the mount command line, but ignore it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dentry.c     | 2 +-
 fs/gfs2/incore.h     | 1 -
 fs/gfs2/ops_fstype.c | 1 -
 fs/gfs2/super.c      | 5 +----
 4 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 		ip = GFS2_I(inode);
 	}
 
-	if (sdp->sd_args.ar_localcaching)
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto valid;
 
 	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c8a2db1bfe65..2990a0a57e61 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -417,7 +417,6 @@ struct gfs2_args {
 	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
 	unsigned int ar_spectator:1;		/* Don't get a journal */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
-	unsigned int ar_localcaching:1;		/* Local caching */
 	unsigned int ar_debug:1;		/* Oops on errors */
 	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
 	unsigned int ar_posix_acl:1;		/* Enable posix acls */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5b5c87dfbfee..558bba493cc4 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1022,7 +1022,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	if (!strcmp("lock_nolock", proto)) {
 		lm = &nolock_ops;
 		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 	} else if (!strcmp("lock_dlm", proto)) {
 		lm = &gfs2_dlm_ops;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 06a4a7e8cf7c..e78de8bf2728 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -165,7 +165,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -1129,7 +1129,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1234,8 +1233,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",spectator");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
 	if (args->ar_upgrade)
-- 
cgit v1.2.3


From 049ef27b224ecc33958465fef83d5e4e8a056115 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 23 Sep 2010 18:26:58 +0400
Subject: nfsd: Export get_task_comm for nfsd

The git://linux-nfs.org/~bfields/linux.git nfsd-next branch doesn't
compile when nfsd is a module with the following error:

   ERROR: "get_task_comm" [fs/nfsd/nfsd.ko] undefined!

Replace the get_task_comm call with direct comm access, which is
safe for current.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b278e444e2f4..7b2fa1d25af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -129,12 +129,10 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
 {
 	static int warned;
 	if (file->f_dentry->d_name.name[0] == '.' && !warned) {
-		char name[sizeof(current->comm)];
 		printk(KERN_INFO
 		       "Warning: \"%s\" uses deprecated NFSD interface: %s."
 		       "  This will be removed in 2.6.40\n",
-		       get_task_comm(name, current),
-		       file->f_dentry->d_name.name);
+		       current->comm, file->f_dentry->d_name.name);
 		warned = 1;
 	}
 	if (! file->private_data) {
-- 
cgit v1.2.3


From ef84303ebc77a9041265faaccd56b7fcef151077 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 23 Sep 2010 12:22:09 -0400
Subject: NFS: handle inode==NULL in __put_nfs_open_context

inode may be NULL when put_nfs_open_context is called from nfs_atomic_lookup
before d_add_unique(dentry, inode)

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2ff814272dcf..702ed096e790 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -654,11 +654,14 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
 	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+	if (inode) {
+		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+			return;
+		list_del(&ctx->list);
+		spin_unlock(&inode->i_lock);
+		NFS_PROTO(inode)->close_context(ctx, is_sync);
+	} else if (!atomic_dec_and_test(&ctx->lock_context.count))
 		return;
-	list_del(&ctx->list);
-	spin_unlock(&inode->i_lock);
-	NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
-- 
cgit v1.2.3


From 7c563cc9f3f4aca70c27bd08a135499227f67014 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Thu, 23 Sep 2010 14:26:48 -0400
Subject: nfs: show "local_lock" mount option in /proc/mounts

Display the status of 'local_lock' mount option in /proc/mounts.


Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2e866d86c220..50ed035cabc4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -632,6 +632,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
 	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;
 
 	seq_printf(m, ",vers=%u", version);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -680,6 +681,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		else
 			seq_printf(m, ",lookupcache=pos");
 	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
 }
 
 /*
-- 
cgit v1.2.3


From c80dbb58f9c3f84372e37dfe80b41810052ad62f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 24 Sep 2010 09:55:07 +0100
Subject: GFS2: Remove upgrade mount option

This option has never done anything useful. Also at the same time
this cleans up the sb checks which are done at mount time. The
debug option will be accepted, but ignored in future. Since it
didn't do anything, there didn't seem much point in retaining it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  1 -
 fs/gfs2/ops_fstype.c | 60 ++--------------------------------------------------
 fs/gfs2/super.c      |  4 +---
 3 files changed, 3 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 2990a0a57e61..6f6ff8aba99a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -418,7 +418,6 @@ struct gfs2_args {
 	unsigned int ar_spectator:1;		/* Don't get a journal */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
 	unsigned int ar_debug:1;		/* Oops on errors */
-	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
 	unsigned int ar_posix_acl:1;		/* Enable posix acls */
 	unsigned int ar_quota:2;		/* off/account/on */
 	unsigned int ar_suiddir:1;		/* suiddir support */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 558bba493cc4..5b11f3768144 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
 #define DO 0
 #define UNDO 1
 
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
 /**
  * gfs2_tune_init - Fill a gfs2_tune structure with default values
  * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-	unsigned int x;
-
 	if (sb->sb_magic != GFS2_MAGIC ||
 	    sb->sb_type != GFS2_METATYPE_SB) {
 		if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
 	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
 		return 0;
 
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
-
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
+	fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
 
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
-
-	return 0;
+	return -EINVAL;
 }
 
 static void end_bio_io_page(struct bio *bio, int error)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e78de8bf2728..d85e0b7c1bb4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -179,7 +179,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -1235,8 +1235,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",localflocks");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
-- 
cgit v1.2.3


From dfb4f309830359352539919f23accc59a20a3758 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 24 Sep 2010 09:17:01 -0400
Subject: NFSv4.1: keep seq_res.sr_slot as pointer rather than an index

Having to explicitly initialize sr_slotid to NFS4_MAX_SLOT_TABLE
resulted in numerous bugs.  Keeping the current slot as a pointer
to the slot table is more straight forward and robust as it's
implicitly set up to NULL wherever the seq_res member is initialized
to zeroes.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 52 ++++++++++++++++++++-----------------------------
 fs/nfs/nfs4xdr.c        |  6 ++----
 fs/nfs/read.c           |  1 -
 fs/nfs/unlink.c         |  3 +--
 fs/nfs/write.c          |  2 --
 include/linux/nfs_xdr.h |  2 +-
 6 files changed, 25 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 72aa7067b85d..aee1bcc1fcc9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -334,10 +334,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * Must be called while holding tbl->slot_tbl_lock
  */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 {
+	int free_slotid = free_slot - tbl->slots;
 	int slotid = free_slotid;
 
+	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
@@ -379,7 +381,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 	struct nfs4_slot_table *tbl;
 
 	tbl = &res->sr_session->fc_slot_table;
-	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
+	if (!res->sr_slot) {
 		/* just wake up the next guy waiting since
 		 * we may have not consumed a slot after all */
 		dprintk("%s: No slot\n", __func__);
@@ -387,17 +389,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 	}
 
 	spin_lock(&tbl->slot_tbl_lock);
-	nfs4_free_slot(tbl, res->sr_slotid);
+	nfs4_free_slot(tbl, res->sr_slot);
 	nfs41_check_drain_session_complete(res->sr_session);
 	spin_unlock(&tbl->slot_tbl_lock);
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	res->sr_slot = NULL;
 }
 
 static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
 	unsigned long timestamp;
-	struct nfs4_slot_table *tbl;
-	struct nfs4_slot *slot;
 	struct nfs_client *clp;
 
 	/*
@@ -410,17 +410,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
 		res->sr_status = NFS_OK;
 
 	/* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
-	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
+	if (!res->sr_slot)
 		goto out;
 
-	tbl = &res->sr_session->fc_slot_table;
-	slot = tbl->slots + res->sr_slotid;
-
 	/* Check the SEQUENCE operation status */
 	switch (res->sr_status) {
 	case 0:
 		/* Update the slot's sequence and clientid lease timer */
-		++slot->seq_nr;
+		++res->sr_slot->seq_nr;
 		timestamp = res->sr_renewal_time;
 		clp = res->sr_session->clp;
 		do_renew_lease(clp, timestamp);
@@ -433,12 +430,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
 		 * returned NFS4ERR_DELAY as per Section 2.10.6.2
 		 * of RFC5661.
 		 */
-		dprintk("%s: slot=%d seq=%d: Operation in progress\n",
-				__func__, res->sr_slotid, slot->seq_nr);
+		dprintk("%s: slot=%ld seq=%d: Operation in progress\n",
+			__func__,
+			res->sr_slot - res->sr_session->fc_slot_table.slots,
+			res->sr_slot->seq_nr);
 		goto out_retry;
 	default:
 		/* Just update the slot sequence no. */
-		++slot->seq_nr;
+		++res->sr_slot->seq_nr;
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -505,10 +504,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 
 	dprintk("--> %s\n", __func__);
 	/* slot already allocated? */
-	if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
+	if (res->sr_slot != NULL)
 		return 0;
 
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 	tbl = &session->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +548,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
 
 	res->sr_session = session;
-	res->sr_slotid = slotid;
+	res->sr_slot = slot;
 	res->sr_renewal_time = jiffies;
 	res->sr_status_flags = 0;
 	/*
@@ -576,8 +574,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
 		goto out;
 	}
 
-	dprintk("--> %s clp %p session %p sr_slotid %d\n",
-		__func__, session->clp, session, res->sr_slotid);
+	dprintk("--> %s clp %p session %p sr_slot %ld\n",
+		__func__, session->clp, session, res->sr_slot ?
+			res->sr_slot - session->fc_slot_table.slots : -1);
 
 	ret = nfs41_setup_sequence(session, args, res, cache_reply,
 				   task);
@@ -650,7 +649,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
 		.callback_data = &data
 	};
 
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	res->sr_slot = NULL;
 	if (privileged)
 		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
@@ -735,7 +734,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init(&p->dir_attr);
-	p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1975,7 +1973,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
-	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	path_get(path);
 	calldata->path = *path;
 
@@ -2550,7 +2547,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
-	res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	res->seq_res.sr_slot = NULL;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
 
@@ -2576,7 +2573,6 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
 	arg->bitmask = server->attr_bitmask;
 	res->server = server;
-	res->seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3646,7 +3642,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	memcpy(&data->stateid, stateid, sizeof(data->stateid));
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	nfs_fattr_init(data->res.fattr);
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
@@ -3799,7 +3794,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	p->arg.fl = &p->fl;
 	p->arg.seqid = seqid;
 	p->res.seqid = seqid;
-	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->arg.stateid = &lsp->ls_stateid;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
@@ -3979,7 +3973,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id.id;
 	p->res.lock_seqid = p->arg.lock_seqid;
-	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->lsp = lsp;
 	p->server = server;
 	atomic_inc(&lsp->ls_count);
@@ -4612,7 +4605,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	};
 	int status;
 
-	res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	dprintk("--> %s\n", __func__);
 	task = rpc_run_task(&task_setup);
 
@@ -5105,12 +5097,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
 		return ERR_PTR(-EIO);
-	calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
 	if (calldata == NULL) {
 		nfs_put_client(clp);
 		return ERR_PTR(-ENOMEM);
 	}
-	calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	msg.rpc_argp = &calldata->args;
 	msg.rpc_resp = &calldata->res;
 	calldata->clp = clp;
@@ -5242,7 +5233,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 		goto out;
 	calldata->clp = clp;
 	calldata->arg.one_fs = 0;
-	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86ab69eb149c..3feace66b981 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4665,7 +4665,6 @@ static int decode_sequence(struct xdr_stream *xdr,
 			   struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-	struct nfs4_slot *slot;
 	struct nfs4_sessionid id;
 	u32 dummy;
 	int status;
@@ -4697,15 +4696,14 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_overflow;
 
 	/* seqid */
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
-	if (dummy != slot->seq_nr) {
+	if (dummy != res->sr_slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
 	dummy = be32_to_cpup(p++);
-	if (dummy != res->sr_slotid) {
+	if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..79859c81a943 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,7 +46,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 47530aacebfd..9a16bad5d2ea 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -262,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		status = PTR_ERR(data->cred);
 		goto out_free;
 	}
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
@@ -427,7 +426,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 		.flags = RPC_TASK_ASYNC,
 	};
 
-	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		return ERR_PTR(-ENOMEM);
 	task_setup_data.callback_data = data,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..4d6d35dd2b4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	}
 	return p;
 }
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 172df83ac54b..5772b2c2f063 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -170,7 +170,7 @@ struct nfs4_sequence_args {
 
 struct nfs4_sequence_res {
 	struct nfs4_session	*sr_session;
-	u8			sr_slotid;	/* slot used to send request */
+	struct nfs4_slot	*sr_slot;	/* slot used to send request */
 	int			sr_status;	/* sequence operation status */
 	unsigned long		sr_renewal_time;
 	u32			sr_status_flags;
-- 
cgit v1.2.3


From 74ec1e1269eba65b5f8e810cf0363ddb7aa64de5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 24 Sep 2010 17:43:59 -0400
Subject: nfsd: fix /proc/net/rpc/nfsd.export/content display

Note with "first" always 0, and "lastflags" initially 0, we always dump
a spurious set of 0 flags at the start, among other problems.

Fix.  And attempt to make the code a little more obvious.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a3c7d0ceb24f..067e2e612e2d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1470,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
 	show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
 }
 
+static bool secinfo_flags_equal(int f, int g)
+{
+	f &= NFSEXP_SECINFO_FLAGS;
+	g &= NFSEXP_SECINFO_FLAGS;
+	return f == g;
+}
+
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
+{
+	int flags;
+
+	flags = (*fp)->flags;
+	seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
+	(*fp)++;
+	while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
+		seq_printf(m, ":%d", (*fp)->pseudoflavor);
+		(*fp)++;
+	}
+	return flags;
+}
+
 static void show_secinfo(struct seq_file *m, struct svc_export *exp)
 {
 	struct exp_flavor_info *f;
 	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-	int lastflags = 0, first = 0;
+	int flags;
 
 	if (exp->ex_nflavors == 0)
 		return;
-	for (f = exp->ex_flavors; f < end; f++) {
-		if (first || f->flags != lastflags) {
-			if (!first)
-				show_secinfo_flags(m, lastflags);
-			seq_printf(m, ",sec=%d", f->pseudoflavor);
-			lastflags = f->flags;
-		} else {
-			seq_printf(m, ":%d", f->pseudoflavor);
-		}
+	f = exp->ex_flavors;
+	flags = show_secinfo_run(m, &f, end);
+	if (!secinfo_flags_equal(flags, exp->ex_flags))
+		show_secinfo_flags(m, flags);
+	while (f != end) {
+		flags = show_secinfo_run(m, &f, end);
+		show_secinfo_flags(m, flags);
 	}
-	show_secinfo_flags(m, lastflags);
 }
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
-- 
cgit v1.2.3


From 352114f395bd79353faf0bc1506ead94de393f55 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Mon, 27 Sep 2010 13:59:48 +0400
Subject: sunrpc: Add net to pure API calls

There are two calls that operate on ip_map_cache and are
directly called from the nfsd code. Other places will be
handled in a different way.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c               |  2 +-
 fs/nfsd/nfsctl.c               |  4 ++--
 include/linux/sunrpc/svcauth.h |  4 ++--
 net/sunrpc/svcauth_unix.c      | 18 ++++++++++--------
 4 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 067e2e612e2d..c0fcb7ab7f6d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1593,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
 	/* Insert client into hashtable. */
 	for (i = 0; i < ncp->cl_naddr; i++) {
 		ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
-		auth_unix_add_addr(&addr6, dom);
+		auth_unix_add_addr(&init_net, &addr6, dom);
 	}
 	auth_unix_forget_old(dom);
 	auth_domain_put(dom);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7b2fa1d25af7..b6e192d25633 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -416,7 +416,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -479,7 +479,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 18bce95255a4..25d333c1b571 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -126,10 +126,10 @@ extern void	svc_auth_unregister(rpc_authflavor_t flavor);
 
 extern struct auth_domain *unix_domain_find(char *name);
 extern void auth_domain_put(struct auth_domain *item);
-extern int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom);
+extern int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom);
 extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new);
 extern struct auth_domain *auth_domain_find(char *name);
-extern struct auth_domain *auth_unix_lookup(struct in6_addr *addr);
+extern struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr);
 extern int auth_unix_forget_old(struct auth_domain *dom);
 extern void svcauth_unix_purge(void);
 extern void svcauth_unix_info_release(struct svc_xprt *xpt);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index f4751805ecfe..2a76c7cf603e 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -327,7 +327,8 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
 		return NULL;
 }
 
-static inline struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr)
+static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
+		struct in6_addr *addr)
 {
 	return __ip_map_lookup(&ip_map_cache, class, addr);
 }
@@ -360,12 +361,13 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
 	return 0;
 }
 
-static inline int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry)
+static inline int ip_map_update(struct net *net, struct ip_map *ipm,
+		struct unix_domain *udom, time_t expiry)
 {
 	return __ip_map_update(&ip_map_cache, ipm, udom, expiry);
 }
 
-int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom)
+int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom)
 {
 	struct unix_domain *udom;
 	struct ip_map *ipmp;
@@ -373,10 +375,10 @@ int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom)
 	if (dom->flavour != &svcauth_unix)
 		return -EINVAL;
 	udom = container_of(dom, struct unix_domain, h);
-	ipmp = ip_map_lookup("nfsd", addr);
+	ipmp = ip_map_lookup(net, "nfsd", addr);
 
 	if (ipmp)
-		return ip_map_update(ipmp, udom, NEVER);
+		return ip_map_update(net, ipmp, udom, NEVER);
 	else
 		return -ENOMEM;
 }
@@ -394,12 +396,12 @@ int auth_unix_forget_old(struct auth_domain *dom)
 }
 EXPORT_SYMBOL_GPL(auth_unix_forget_old);
 
-struct auth_domain *auth_unix_lookup(struct in6_addr *addr)
+struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr)
 {
 	struct ip_map *ipm;
 	struct auth_domain *rv;
 
-	ipm = ip_map_lookup("nfsd", addr);
+	ipm = ip_map_lookup(net, "nfsd", addr);
 
 	if (!ipm)
 		return NULL;
@@ -725,7 +727,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 
 	ipm = ip_map_cached_get(xprt);
 	if (ipm == NULL)
-		ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class,
+		ipm = ip_map_lookup(&init_net, rqstp->rq_server->sv_program->pg_class,
 				    &sin6->sin6_addr);
 
 	if (ipm == NULL)
-- 
cgit v1.2.3


From d0795f912318f65b800c6b619d749c3bf7c930fb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Sep 2010 15:58:11 +0100
Subject: GFS2: Fix journal check for spectator mounts

When checking journals for spectator mounts, we cannot rely on the
journal being locked, whatever its jid might be. This patch
ensures that we always get the journal locks when checking
journals for a spectator mount.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..666548e14596 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -456,7 +456,8 @@ void gfs2_recover_func(struct work_struct *work)
 	unsigned int pass;
 	int error;
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (sdp->sd_args.ar_spectator ||
+	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
 
-- 
cgit v1.2.3


From 54dd55a406f6e9cb5ae208f258b907455162e045 Mon Sep 17 00:00:00 2001
From: Steffen Sledz <sledz@dresearch.de>
Date: Mon, 27 Sep 2010 14:20:26 +0200
Subject: UBIFS: avoid kernel error if ubifs superblock read fails

.get_sb is called on mounts with automatic fs detection too, so this
function should print an error if it cannot read the superblock in
debug mode only (new behaviour conforms the other fs types)

Signed-off-by: Steffen Sledz <sledz@dresearch.de>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4e5bf3fbf782..bb6ed5da2d10 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2054,8 +2054,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	 */
 	ubi = open_ubi(name, UBI_READONLY);
 	if (IS_ERR(ubi)) {
-		ubifs_err("cannot open \"%s\", error %d",
-			  name, (int)PTR_ERR(ubi));
+		dbg_err("cannot open \"%s\", error %d",
+			name, (int)PTR_ERR(ubi));
 		return PTR_ERR(ubi);
 	}
 	ubi_get_volume_info(ubi, &vi);
-- 
cgit v1.2.3


From bf97b6734e027cc18abad420ab88f861f65d7816 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Mon, 27 Sep 2010 16:00:04 -0500
Subject: GFS2: reserve more blocks for transactions

Some of the functions in GFS2 were not reserving space in the transaction for
the resource group header and the resource groups bitblocks that get added
when you do allocation. GFS2 now makes sure to reserve space for the
resource group header and either all the bitblocks in the resource group, or
one for each block that it may allocate, whichever is smaller using the new
gfs2_rg_blocks() inline function.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c      | 2 ++
 fs/gfs2/bmap.c      | 2 +-
 fs/gfs2/file.c      | 4 +++-
 fs/gfs2/ops_inode.c | 6 +++---
 fs/gfs2/quota.c     | 3 ++-
 fs/gfs2/trans.h     | 8 ++++++++
 fs/gfs2/xattr.c     | 2 +-
 7 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 180ef8a6de6b..1bf178831ae8 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		rblocks += RES_STATFS + RES_QUOTA;
 	if (&ip->i_inode == sdp->sd_rindex)
 		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(al);
 
 	error = gfs2_trans_begin(sdp, rblocks,
 				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 04513e997df6..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1151,7 +1151,7 @@ static int do_grow(struct inode *inode, u64 size)
 			goto do_grow_qunlock;
 	}
 
-	error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
 	if (error)
 		goto do_grow_release;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index daadcd2e755f..237ee6a940df 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	rblocks = RES_DINODE + ind_blocks;
 	if (gfs2_is_jdata(ip))
 		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
+	if (ind_blocks || data_blocks) {
 		rblocks += RES_STATFS + RES_QUOTA;
+		rblocks += gfs2_rg_blocks(al);
+	}
 	ret = gfs2_trans_begin(sdp, rblocks, 0);
 	if (ret)
 		goto out_trans_fail;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 98a94cfc2bb2..fba00171d915 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -219,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 2 * RES_DINODE + RES_STATFS +
 					 RES_QUOTA, 0);
 		if (error)
@@ -884,7 +884,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 4 * RES_DINODE + 4 * RES_LEAF +
 					 RES_STATFS + RES_QUOTA + 4, 0);
 		if (error)
@@ -1481,7 +1481,7 @@ retry:
 		al->al_requested = data_blocks + ind_blocks;
 
 		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
-			  RES_RG_HDR + ip->i_alloc->al_rgd->rd_length;
+			  RES_RG_HDR + gfs2_rg_blocks(al);
 		if (gfs2_is_jdata(ip))
 			rblocks += data_blocks ? data_blocks : 1;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 9bc6dd9a5443..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -815,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -1586,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip);
 		if (error)
 			goto out_alloc;
+		blocks += gfs2_rg_blocks(al);
 	}
 
 	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index b849eb7ad37d..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -26,6 +26,14 @@ struct gfs2_glock;
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length)?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
-- 
cgit v1.2.3


From d594845106f34c079a6b05be01a37e4883c3bf4c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 28 Sep 2010 10:17:47 +0100
Subject: GFS2: Fix compiler warning from previous patch

This shouldn't really be required, but gcc can't tell that
"al" is only accessed when initialised.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1bf178831ae8..6b24afb96aae 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
-	struct gfs2_alloc *al;
+	struct gfs2_alloc *al = NULL;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned to = from + len;
-- 
cgit v1.2.3


From c741c4551237f9c1bdcd3b1b39b0883bd19a3723 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Sep 2010 14:20:52 +0100
Subject: GFS2: Fix spectator umount issue

The tests further down the recovery function relating to
unlocking the journal need to be updated to match the
intial test. Also, a test in the umount code which was
surplus to requirements has been removed. Umounting
spectator mounts now works correctly, as expected.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c    |  2 +-
 fs/gfs2/recovery.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c3f2a5cc8efb..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1517,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
 	spin_unlock(&lru_lock);
 
 	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+	if (gl->gl_state != LM_ST_UNLOCKED)
 		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 666548e14596..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,12 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
 	int ro = 0;
 	unsigned int pass;
 	int error;
+	int jlocked = 0;
 
 	if (sdp->sd_args.ar_spectator ||
 	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
-
+		jlocked = 1;
 		/* Acquire the journal lock so we can do recovery */
 
 		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -555,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
 			jd->jd_jid, t);
 	}
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-		gfs2_glock_dq_uninit(&ji_gh);
-
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
 		gfs2_glock_dq_uninit(&j_gh);
+	}
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
 	goto done;
@@ -569,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
 fail_gunlock_ji:
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (jlocked) {
 		gfs2_glock_dq_uninit(&ji_gh);
 fail_gunlock_j:
 		gfs2_glock_dq_uninit(&j_gh);
-- 
cgit v1.2.3


From 43f74c199563a4273e528e2166d0650625a1e05f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Sep 2010 14:24:41 +0100
Subject: GFS2: Add "norecovery" mount option as a synonym for "spectator"

XFS supports the "norecovery" mount option which is basically the
same as the GFS2 spectator mode. This adds support for "norecovery"
as a synonym for spectator mode, which is hopefully a more obvious
description of what it actually does.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d85e0b7c1bb4..047d1176096c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
-- 
cgit v1.2.3


From feb47ca9314666d920855b8a235032dea2b2caa4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Sep 2010 15:04:18 +0100
Subject: GFS2: Improve journal allocation via sysfs

Recently a feature was added to GFS2 to allow journal id allocation
via sysfs. This patch builds upon that so that a negative journal id
will be treated as an error code to be passed back as the return code
from mount. This allows termination of the mount process if there is
a failure.

Also, the process has been updated so that the kernel will wait
for a journal id, even in the "spectator" case. This is required
in order to avoid mounting a filesystem in case there is an error
while joining the cluster. In the spectator case, 0 is written into
the file to indicate that all is well, and that mount should continue.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  2 +-
 fs/gfs2/ops_fstype.c | 16 ++++++++++++++--
 fs/gfs2/sys.c        | 17 +++++++++--------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6f6ff8aba99a..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -494,7 +494,7 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	unsigned int ls_jid;
+	int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
 	unsigned int ls_nodir;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5b11f3768144..aeafc233dc89 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1056,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
 
 static int wait_on_journal(struct gfs2_sbd *sdp)
 {
-	if (sdp->sd_args.ar_spectator)
-		return 0;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		return 0;
 
@@ -1160,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail_sb;
 
+	/*
+	 * If user space has failed to join the cluster or some similar
+	 * failure has occurred, then the journal id will contain a
+	 * negative (error) number. This will then be returned to the
+	 * caller (of the mount syscall). We do this even for spectator
+	 * mounts (which just write a jid of 0 to indicate "ok" even though
+	 * the jid is unused in the spectator case)
+	 */
+	if (sdp->sd_lockstruct.ls_jid < 0) {
+		error = sdp->sd_lockstruct.ls_jid;
+		sdp->sd_lockstruct.ls_jid = 0;
+		goto fail_sb;
+	}
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..64082a5feae1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -399,31 +399,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
 }
 
 static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-        unsigned jid;
+        int jid;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
+	rv = sscanf(buf, "%d", &jid);
 	if (rv != 1)
 		return -EINVAL;
 
 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EINVAL;
-	if (sdp->sd_args.ar_spectator)
-		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	rv = -EBUSY;
-	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
 		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
 	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -617,7 +618,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
-- 
cgit v1.2.3


From 5c78f58e2d5cef65c255a556184f1f43c8d84c84 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 27 Sep 2010 15:51:20 -0400
Subject: NFS: Really fix put_nfs_open_context()

In nfs_open_revalidate(), if the open_context() call returns an inode that
is not the same as dentry->d_inode, then we will call
put_nfs_open_context() with a valid dentry->d_inode, but without the
context being part of the nfsi->open_files list.

In this case too, we want to just skip the list removal, but we do want to
call the ->close_context() callback in order to close the NFSv4 state.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 702ed096e790..18be041abd23 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -639,6 +639,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr
 		ctx->dir_cookie = 0;
 		nfs_init_lock_context(&ctx->lock_context);
 		ctx->lock_context.open_context = ctx;
+		INIT_LIST_HEAD(&ctx->list);
 	}
 	return ctx;
 }
@@ -654,14 +655,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
 	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (inode) {
+	if (!list_empty(&ctx->list)) {
 		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
 			return;
 		list_del(&ctx->list);
 		spin_unlock(&inode->i_lock);
-		NFS_PROTO(inode)->close_context(ctx, is_sync);
 	} else if (!atomic_dec_and_test(&ctx->lock_context.count))
 		return;
+	if (inode != NULL)
+		NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
-- 
cgit v1.2.3


From a00dd6c03dd97a777c291a8af8682be4b5fadf8d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 28 Sep 2010 09:14:01 -0400
Subject: NFS: don't use FLUSH_SYNC on WB_SYNC_NONE COMMIT calls (try #2)

WB_SYNC_NONE is supposed to mean "don't wait on anything". That should
also include not waiting for COMMIT calls to complete.

WB_SYNC_NONE is also implied when wbc->nonblocking and
wbc->for_background are set, so we can replace those checks in
nfs_commit_unstable_pages with a check for WB_SYNC_NONE.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4d6d35dd2b4b..605e292501f4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1431,15 +1431,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int flags = FLUSH_SYNC;
 	int ret = 0;
 
-	/* Don't commit yet if this is a non-blocking flush and there are
-	 * lots of outstanding writes for this mapping.
-	 */
-	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    nfsi->ncommit <= (nfsi->npages >> 1))
-		goto out_mark_dirty;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		/* Don't commit yet if this is a non-blocking flush and there
+		 * are a lot of outstanding writes for this mapping.
+		 */
+		if (nfsi->ncommit <= (nfsi->npages >> 1))
+			goto out_mark_dirty;
 
-	if (wbc->nonblocking || wbc->for_background)
+		/* don't wait for the COMMIT response */
 		flags = 0;
+	}
+
 	ret = nfs_commit_inode(inode, flags);
 	if (ret >= 0) {
 		if (wbc->sync_mode == WB_SYNC_NONE) {
-- 
cgit v1.2.3


From a347ecb209b58a1b37f20d8299ab552f7d3ee8c3 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Fri, 17 Sep 2010 19:43:10 +0530
Subject: cifs: use type __u32 instead of int for the oplock parameter

... and avoid implicit casting from a signed type. Also, pass oplock by value
instead by reference as we don't intend to change the value in
cifs_open_inode_helper().

Thanks to Jeff Layton for spotting this.

Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..d33da45fe987 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -165,7 +165,7 @@ psx_client_can_cache:
 
 /* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode,
-	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
+	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
 	char *full_path, int xid)
 {
 	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -207,11 +207,11 @@ client_can_cache:
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
 					 xid, NULL);
 
-	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
 		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
-	} else if ((*oplock & 0xF) == OPLOCK_READ)
+	} else if ((oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
 	return rc;
@@ -365,7 +365,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
+	rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
 	if (rc != 0)
 		goto out;
 
-- 
cgit v1.2.3


From aa91c7e4ab9b0842b7d7a7cbf8cca18b20df89b5 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Fri, 17 Sep 2010 18:56:39 +0530
Subject: cifs: fix broken oplock handling

cifs_new_fileinfo() does not use the 'oplock' value from the callers. Instead,
it sets it to REQ_OPLOCK which seems wrong. We should be using the oplock value
obtained from the Server to set the inode's clientCanCacheAll or
clientCanCacheRead flags. Fix this by passing oplock from the callers to
cifs_new_fileinfo().

This change dates back to commit a6ce4932 (2.6.30-rc3). So, all the affected
versions will need this fix. Please Cc stable once reviewed and accepted.

Cc: Stable <stable@kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  3 ++-
 fs/cifs/dir.c       | 12 +++++-------
 fs/cifs/file.c      |  4 ++--
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..f110e0e7e947 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,7 +107,8 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 
 extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
-				struct vfsmount *mnt, unsigned int oflags);
+				struct vfsmount *mnt, unsigned int oflags,
+				__u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
 				int mode, int oflags,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..0f947bf73f8e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -132,9 +132,9 @@ cifs_bp_rename_retry:
 
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
-		  struct file *file, struct vfsmount *mnt, unsigned int oflags)
+		  struct file *file, struct vfsmount *mnt, unsigned int oflags,
+		  __u32 oplock)
 {
-	int oplock = 0;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
@@ -143,9 +143,6 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	if (pCifsFile == NULL)
 		return pCifsFile;
 
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
-
 	pCifsFile->netfid = fileHandle;
 	pCifsFile->pid = current->tgid;
 	pCifsFile->pInode = igrab(newinode);
@@ -468,7 +465,7 @@ cifs_create_set_dentry:
 		}
 
 		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-					       nd->path.mnt, oflags);
+					       nd->path.mnt, oflags, oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -729,7 +726,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
 						  nd->path.mnt,
-						  nd->intent.open.flags);
+						  nd->intent.open.flags,
+						  oplock);
 			if (cfile == NULL) {
 				fput(filp);
 				CIFSSMBClose(xid, pTcon, fileHandle);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d33da45fe987..60061b9c2f67 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -277,7 +277,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
 							file->f_path.mnt,
-							oflags);
+							oflags, oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
@@ -370,7 +370,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 
 	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
-					file->f_flags);
+					file->f_flags, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
-- 
cgit v1.2.3


From 5f98ca9afb9c004f8948c0d40920503de447918a Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sat, 18 Sep 2010 22:01:58 -0500
Subject: cifs NTLMv2/NTLMSSP Change variable name mac_key to session key to
 reflect the key it holds

Change name of variable mac_key to session key.
The reason mac_key was changed to session key is, this structure does not
hold message authentication code, it holds the session key (for ntlmv2,
ntlmv1 etc.).  mac is generated as a signature in cifs_calc* functions.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 22 +++++++++++-----------
 fs/cifs/cifsglob.h    |  4 ++--
 fs/cifs/cifsproto.h   |  4 ++--
 fs/cifs/sess.c        | 10 +++++-----
 fs/cifs/transport.c   |  6 +++---
 5 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..eed70cae1275 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -42,7 +42,7 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 		       unsigned char *p24);
 
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-				    const struct mac_key *key, char *signature)
+				const struct session_key *key, char *signature)
 {
 	struct	MD5Context context;
 
@@ -78,7 +78,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
+	rc = cifs_calculate_signature(cifs_pdu, &server->session_key,
 				      smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
@@ -89,7 +89,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-				const struct mac_key *key, char *signature)
+				const struct session_key *key, char *signature)
 {
 	struct  MD5Context context;
 	int i;
@@ -145,7 +145,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
+	rc = cifs_calc_signature2(iov, n_vec, &server->session_key,
 				      smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
@@ -156,14 +156,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-			  const struct mac_key *mac_key,
+			  const struct session_key *session_key,
 			  __u32 expected_sequence_number)
 {
 	unsigned int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
 
-	if ((cifs_pdu == NULL) || (mac_key == NULL))
+	if (cifs_pdu == NULL || session_key == NULL)
 		return -EINVAL;
 
 	if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +192,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 					cpu_to_le32(expected_sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	rc = cifs_calculate_signature(cifs_pdu, mac_key,
+	rc = cifs_calculate_signature(cifs_pdu, session_key,
 		what_we_think_sig_should_be);
 
 	if (rc)
@@ -209,7 +209,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
 
 /* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+int cifs_calculate_session_key(struct session_key *key, const char *rn,
 			   const char *password)
 {
 	char temp_key[16];
@@ -347,11 +347,11 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	/* now calculate the MAC key for NTLMv2 */
 	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
 	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+	hmac_md5_final(ses->server->session_key.data.ntlmv2.key, &context);
 
-	memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
+	memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
 	       sizeof(struct ntlmv2_resp));
-	ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+	ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
 }
 
 void CalcNTLMv2_response(const struct cifsSesInfo *ses,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..14dfa9a067e5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -97,7 +97,7 @@ enum protocolEnum {
 	/* Netbios frames protocol not supported at this time */
 };
 
-struct mac_key {
+struct session_key {
 	unsigned int len;
 	union {
 		char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -182,7 +182,7 @@ struct TCP_Server_Info {
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
-	struct mac_key mac_signing_key;
+	struct session_key session_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f110e0e7e947..099fd6173e01 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -363,9 +363,9 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-				 const struct mac_key *mac_key,
+				 const struct session_key *session_key,
 				__u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
 				 const char *pass);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
 extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..88820127650e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -480,7 +480,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	/* calculate session key,  BB what about adding similar ntlmv2 path? */
 	SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
 	if (first)
-		cifs_calculate_mac_key(&ses->server->mac_signing_key,
+		cifs_calculate_session_key(&ses->server->session_key,
 				       ntlm_session_key, ses->password);
 
 	memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
@@ -690,7 +690,7 @@ ssetup_ntlmssp_authenticate:
 
 		if (first_time) /* should this be moved into common code
 				  with similar ntlmv2 path? */
-			cifs_calculate_mac_key(&ses->server->mac_signing_key,
+			cifs_calculate_session_key(&ses->server->session_key,
 				ntlm_session_key, ses->password);
 		/* copy session key */
 
@@ -765,15 +765,15 @@ ssetup_ntlmssp_authenticate:
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
-		    sizeof(ses->server->mac_signing_key.data.krb5)) {
+		    sizeof(ses->server->session_key.data.krb5)) {
 			cERROR(1, "Kerberos signing key too long (%u bytes)",
 				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
 		if (first_time) {
-			ses->server->mac_signing_key.len = msg->sesskey_len;
-			memcpy(ses->server->mac_signing_key.data.krb5,
+			ses->server->session_key.len = msg->sesskey_len;
+			memcpy(ses->server->session_key.data.krb5,
 				msg->data, msg->sesskey_len);
 		}
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..a66c91eb6eb4 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(midQ->resp_buf,
-						&ses->server->mac_signing_key,
+						&ses->server->session_key,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(out_buf,
-						&ses->server->mac_signing_key,
+						&ses->server->session_key,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 				     SECMODE_SIGN_ENABLED))) {
 		rc = cifs_verify_signature(out_buf,
-					   &ses->server->mac_signing_key,
+					   &ses->server->session_key,
 					   midQ->sequence_number+1);
 		if (rc) {
 			cERROR(1, "Unexpected SMB signature");
-- 
cgit v1.2.3


From 2b149f11978b44199954710d32c0eecf6c9efd9c Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sat, 18 Sep 2010 22:02:18 -0500
Subject: cifs NTLMv2/NTLMSSP ntlmv2 within ntlmssp autentication code

Attribue Value (AV) pairs or Target Info (TI) pairs are part of
ntlmv2 authentication.
Structure ntlmv2_resp had only definition for two av pairs.
So removed it, and now allocation of av pairs is dynamic.
For servers like Windows 7/2008, av pairs sent by server in
challege packet (type 2 in the ntlmssp exchange/negotiation) can
vary.

Server sends them during ntlmssp negotiation. So when ntlmssp is used
as an authentication mechanism, type 2 challenge packet from server
has this information.  Pluck it and use the entire blob for
authenticaiton purpose.  If user has not specified, extract
(netbios) domain name from the av pairs which is used to calculate
ntlmv2 hash.  Servers like Windows 7 are particular about the AV pair
blob.

Servers like Windows 2003, are not very strict about the contents
of av pair blob used during ntlmv2 authentication.
So when security mechanism such as ntlmv2 is used (not ntlmv2 in ntlmssp),
there is no negotiation and so genereate a minimal blob that gets
used in ntlmv2 authentication as well as gets sent.

Fields tilen and tilbob are session specific.  AV pair values are defined.

To calculate ntlmv2 response we need ti/av pair blob.

For sec mech like ntlmssp, the blob is plucked from type 2 response from
the server.  From this blob, netbios name of the domain is retrieved,
if user has not already provided, to be included in the Target String
as part of ntlmv2 hash calculations.

For sec mech like ntlmv2, create a minimal, two av pair blob.

The allocated blob is freed in case of error.  In case there is no error,
this blob is used in calculating ntlmv2 response (in CalcNTLMv2_response)
and is also copied on the response to the server, and then freed.

The type 3 ntlmssp response is prepared on a buffer,
5 * sizeof of struct _AUTHENTICATE_MESSAGE, an empirical value large
enough to hold _AUTHENTICATE_MESSAGE plus a blob with max possible
10 values as part of ntlmv2 response and lmv2 keys and domain, user,
workstation  names etc.

Also, kerberos gets selected as a default mechanism if server supports it,
over the other security mechanisms.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/cifs/cifsglob.h    |   2 +
 fs/cifs/cifspdu.h     |   1 -
 fs/cifs/cifsproto.h   |   2 +-
 fs/cifs/cifssmb.c     |  16 ++++---
 fs/cifs/connect.c     |   2 +
 fs/cifs/ntlmssp.h     |  15 +++++++
 fs/cifs/sess.c        | 119 +++++++++++++++++++++++++++++++++----------------
 8 files changed, 225 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index eed70cae1275..730038a12982 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
 
@@ -262,6 +263,87 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
+/* This is just a filler for ntlmv2 type of security mechanisms.
+ * Older servers are not very particular about the contents of av pairs
+ * in the blob and for sec mechs like ntlmv2, there is no negotiation
+ * as in ntlmssp, so unless domain and server  netbios and dns names
+ * are specified, there is no way to obtain name.  In case of ntlmssp,
+ * server provides that info in type 2 challenge packet
+ */
+static int
+build_avpair_blob(struct cifsSesInfo *ses)
+{
+	struct ntlmssp2_name *attrptr;
+
+	ses->tilen = 2 * sizeof(struct ntlmssp2_name);
+	ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL);
+	if (!ses->tiblob) {
+		ses->tilen = 0;
+		cERROR(1, "Challenge target info allocation failure");
+		return -ENOMEM;
+	}
+	attrptr = (struct ntlmssp2_name *) ses->tiblob;
+	attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
+
+	return 0;
+}
+
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find netbios domain name to be used
+ * as part of ntlmv2 authentication (in Target String), if not already
+ * specified on the command line.
+ * If this function returns without any error but without fetching
+ * domain name, authentication may fail against some server but
+ * may not fail against other (those who are not very particular
+ * about target string i.e. for some, just user name might suffice.
+ */
+static int
+find_domain_name(struct cifsSesInfo *ses)
+{
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned int onesize = sizeof(struct ntlmssp2_name);
+	unsigned char *blobptr;
+	unsigned char *blobend;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->tilen || !ses->tiblob)
+		return 0;
+
+	blobptr = ses->tiblob;
+	blobend = ses->tiblob + ses->tilen;
+
+	while (blobptr + onesize < blobend) {
+		attrptr = (struct ntlmssp2_name *) blobptr;
+		type = le16_to_cpu(attrptr->type);
+		if (type == NTLMSSP_AV_EOL)
+			break;
+		blobptr += 2; /* advance attr type */
+		attrsize = le16_to_cpu(attrptr->length);
+		blobptr += 2; /* advance attr size */
+		if (blobptr + attrsize > blobend)
+			break;
+		if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+			if (!attrsize)
+				break;
+			if (!ses->domainName) {
+				ses->domainName =
+					kmalloc(attrsize + 1, GFP_KERNEL);
+				if (!ses->domainName)
+						return -ENOMEM;
+				cifs_from_ucs2(ses->domainName,
+					(__le16 *)blobptr, attrsize, attrsize,
+					load_nls_default(), false);
+				break;
+			}
+		}
+		blobptr += attrsize; /* advance attr  value */
+	}
+
+	return 0;
+}
+
 static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 			    const struct nls_table *nls_cp)
 {
@@ -321,7 +403,8 @@ calc_exit_2:
 	return rc;
 }
 
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 		      const struct nls_table *nls_cp)
 {
 	int rc;
@@ -333,15 +416,29 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
-	buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-	buf->names[0].length = 0;
-	buf->names[1].type = 0;
-	buf->names[1].length = 0;
+
+	if (ses->server->secType == RawNTLMSSP) {
+		if (!ses->domainName) {
+			rc = find_domain_name(ses);
+			if (rc) {
+				cERROR(1, "error %d finding domain name", rc);
+				goto setup_ntlmv2_rsp_ret;
+			}
+		}
+	} else {
+		rc = build_avpair_blob(ses);
+		if (rc) {
+			cERROR(1, "error %d building av pair blob", rc);
+			return rc;
+		}
+	}
 
 	/* calculate buf->ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
-	if (rc)
+	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
 	CalcNTLMv2_response(ses, resp_buf);
 
 	/* now calculate the MAC key for NTLMv2 */
@@ -352,6 +449,15 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
 	       sizeof(struct ntlmv2_resp));
 	ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
+
+	return 0;
+
+setup_ntlmv2_rsp_ret:
+	kfree(ses->tiblob);
+	ses->tiblob = NULL;
+	ses->tilen = 0;
+
+	return rc;
 }
 
 void CalcNTLMv2_response(const struct cifsSesInfo *ses,
@@ -365,6 +471,9 @@ void CalcNTLMv2_response(const struct cifsSesInfo *ses,
 	hmac_md5_update(v2_session_response+8,
 			sizeof(struct ntlmv2_resp) - 8, &context);
 
+	if (ses->tilen)
+		hmac_md5_update(ses->tiblob, ses->tilen, &context);
+
 	hmac_md5_final(v2_session_response, &context);
 /*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 14dfa9a067e5..c68f31cf4550 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -222,6 +222,8 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	unsigned int tilen; /* length of the target info blob */
+	unsigned char *tiblob; /* target info blob in challenge response */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..b0f4b5656d4c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -663,7 +663,6 @@ struct ntlmv2_resp {
 	__le64  time;
 	__u64  client_chal; /* random */
 	__u32  reserved2;
-	struct ntlmssp2_name names[2];
 	/* array of name entries could follow ending in minimum 4 byte struct */
 } __attribute__((packed));
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 099fd6173e01..588612867451 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -368,7 +368,7 @@ extern int cifs_verify_signature(struct smb_hdr *,
 extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
 				 const char *pass);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 			     const struct nls_table *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..13c854e41073 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -603,13 +603,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 				rc = 0;
 			else
 				rc = -EINVAL;
-
-			if (server->sec_kerberos || server->sec_mskerberos)
-				server->secType = Kerberos;
-			else if (server->sec_ntlmssp)
-				server->secType = RawNTLMSSP;
-			else
-				rc = -EOPNOTSUPP;
+			if (server->secType == Kerberos) {
+				if (!server->sec_kerberos &&
+						!server->sec_mskerberos)
+					rc = -EOPNOTSUPP;
+			} else if (server->secType == RawNTLMSSP) {
+				if (!server->sec_ntlmssp)
+					rc = -EOPNOTSUPP;
+			} else
+					rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..c99760a523bf 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1740,6 +1740,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 	if (ses == NULL)
 		goto get_ses_fail;
 
+	ses->tilen = 0;
+	ses->tiblob = NULL;
 	/* new SMB session uses our server ref */
 	ses->server = server;
 	if (server->addr.sockAddr6.sin6_family == AF_INET6)
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
 #define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
 #define NTLMSSP_NEGOTIATE_56        0x80000000
 
+/* Define AV Pair Field IDs */
+enum av_field_type {
+	NTLMSSP_AV_EOL = 0,
+	NTLMSSP_AV_NB_COMPUTER_NAME,
+	NTLMSSP_AV_NB_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_COMPUTER_NAME,
+	NTLMSSP_AV_DNS_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_TREE_NAME,
+	NTLMSSP_AV_FLAGS,
+	NTLMSSP_AV_TIMESTAMP,
+	NTLMSSP_AV_RESTRICTION,
+	NTLMSSP_AV_TARGET_NAME,
+	NTLMSSP_AV_CHANNEL_BINDINGS
+};
+
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful      */
 /* to more closely match the standards document for NTLMSSP from     */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 88820127650e..af18a500f7e0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 				    struct cifsSesInfo *ses)
 {
+	unsigned int tioffset; /* challenge message target info area */
+	unsigned int tilen; /* challenge message target info area length  */
+
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -405,6 +408,19 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
 
+	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
+	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+	ses->tilen = tilen;
+	if (ses->tilen) {
+		ses->tiblob = kmalloc(tilen, GFP_KERNEL);
+		if (!ses->tiblob) {
+			cERROR(1, "Challenge target info allocation failure");
+			ses->tilen = 0;
+			return -ENOMEM;
+		}
+		memcpy(ses->tiblob,  bcc_ptr + tioffset, ses->tilen);
+	}
+
 	return 0;
 }
 
@@ -425,7 +441,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -449,12 +465,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 				   struct cifsSesInfo *ses,
-				   const struct nls_table *nls_cp, bool first)
+				   const struct nls_table *nls_cp)
 {
+	int rc;
+	unsigned int size;
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
 	unsigned char *tmp;
-	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+	struct ntlmv2_resp ntlmv2_response = {};
 
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +480,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +495,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->LmChallengeResponse.Length = 0;
 	sec_blob->LmChallengeResponse.MaximumLength = 0;
 
-	/* calculate session key,  BB what about adding similar ntlmv2 path? */
-	SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
-	if (first)
-		cifs_calculate_session_key(&ses->server->session_key,
-				       ntlm_session_key, ses->password);
-
-	memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
 	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	sec_blob->NtChallengeResponse.MaximumLength =
-				cpu_to_le16(CIFS_SESS_KEY_SIZE);
+	rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
+	if (rc) {
+		cERROR(1, "Error %d during NTLMSSP authentication", rc);
+		goto setup_ntlmv2_ret;
+	}
+	size =  sizeof(struct ntlmv2_resp);
+	memcpy(tmp, (char *)&ntlmv2_response, size);
+	tmp += size;
+	if (ses->tilen > 0) {
+		memcpy(tmp, ses->tiblob, ses->tilen);
+		tmp += ses->tilen;
+	}
 
-	tmp += CIFS_SESS_KEY_SIZE;
+	sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen);
+	sec_blob->NtChallengeResponse.MaximumLength =
+				cpu_to_le16(size + ses->tilen);
+	kfree(ses->tiblob);
+	ses->tiblob = NULL;
+	ses->tilen = 0;
 
 	if (ses->domainName == NULL) {
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +526,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
 		sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +542,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
 		sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -533,6 +556,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
 	sec_blob->SessionKey.Length = 0;
 	sec_blob->SessionKey.MaximumLength = 0;
+
+setup_ntlmv2_ret:
 	return tmp - pbuffer;
 }
 
@@ -545,19 +570,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 	return;
 }
-
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
-				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, bool first_time)
-{
-	int bloblen;
-
-	bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
-					  first_time);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
-
-	return bloblen;
-}
 #endif
 
 int
@@ -580,6 +592,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	bool first_time;
+	int blob_len;
+	char *ntlmsspblob = NULL;
 
 	if (ses == NULL)
 		return -EINVAL;
@@ -729,12 +743,24 @@ ssetup_ntlmssp_authenticate:
 			cpu_to_le16(sizeof(struct ntlmv2_resp));
 
 		/* calculate session key */
-		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-		/* FIXME: calculate MAC key */
+		rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+		if (rc) {
+			cERROR(1, "Error %d during NTLMv2 authentication", rc);
+			kfree(v2_sess_key);
+			goto ssetup_exit;
+		}
 		memcpy(bcc_ptr, (char *)v2_sess_key,
-		       sizeof(struct ntlmv2_resp));
+				sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
 		kfree(v2_sess_key);
+		if (ses->tilen > 0) {
+			memcpy(bcc_ptr, ses->tiblob, ses->tilen);
+			bcc_ptr += ses->tilen;
+			/* we never did allocate ses->domainName to free */
+			kfree(ses->tiblob);
+			ses->tiblob = NULL;
+			ses->tilen = 0;
+		}
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
 				*bcc_ptr = 0;
@@ -815,12 +841,28 @@ ssetup_ntlmssp_authenticate:
 			if (phase == NtLmNegotiate) {
 				setup_ntlmssp_neg_req(pSMB, ses);
 				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			} else if (phase == NtLmAuthenticate) {
-				int blob_len;
-				blob_len = setup_ntlmssp_auth_req(pSMB, ses,
-								  nls_cp,
-								  first_time);
+				/* 5 is an empirical value, large enought to
+				 * hold authenticate message, max 10 of
+				 * av paris, doamin,user,workstation mames,
+				 * flags etc..
+				 */
+				ntlmsspblob = kmalloc(
+					5*sizeof(struct _AUTHENTICATE_MESSAGE),
+					GFP_KERNEL);
+				if (!ntlmsspblob) {
+					cERROR(1, "Can't allocate NTLMSSP");
+					rc = -ENOMEM;
+					goto ssetup_exit;
+				}
+
+				blob_len = build_ntlmssp_auth_blob(ntlmsspblob,
+							ses, nls_cp);
 				iov[1].iov_len = blob_len;
+				iov[1].iov_base = ntlmsspblob;
+				pSMB->req.SecurityBlobLength =
+					cpu_to_le16(blob_len);
 				/* Make sure that we tell the server that we
 				   are using the uid that it just gave us back
 				   on the response (challenge) */
@@ -830,7 +872,6 @@ ssetup_ntlmssp_authenticate:
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
-			iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			/* unicode strings must be word aligned */
 			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 				*bcc_ptr = 0;
@@ -931,6 +972,8 @@ ssetup_exit:
 		key_put(spnego_key);
 	}
 	kfree(str_area);
+	kfree(ntlmsspblob);
+	ntlmsspblob = NULL;
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
-- 
cgit v1.2.3


From 3eb9a8893a76cf1cda3b41c3212eb2cfe83eae0e Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Wed, 1 Sep 2010 17:06:02 -0700
Subject: cifs: Allow binding to local IP address.

When using multi-homed machines, it's nice to be able to specify
the local IP to use for outbound connections.  This patch gives
cifs the ability to bind to a particular IP address.

   Usage:  mount -t cifs -o srcaddr=192.168.1.50,user=foo, ...
   Usage:  mount -t cifs -o srcaddr=2002::100:1,user=foo, ...

Acked-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Dr. David Holder <david.holder@erion.co.uk>
Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   | 19 ++++++++++++
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 108 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..1b6ddd6f760f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -36,6 +36,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/smp_lock.h>
+#include <net/ipv6.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -367,6 +368,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
 	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct sockaddr *srcaddr;
+	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",unc=%s", tcon->treeName);
 	if (tcon->ses->userName)
@@ -374,6 +377,22 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 	if (tcon->ses->domainName)
 		seq_printf(s, ",domain=%s", tcon->ses->domainName);
 
+	if (srcaddr->sa_family != AF_UNSPEC) {
+		struct sockaddr_in *saddr4;
+		struct sockaddr_in6 *saddr6;
+		saddr4 = (struct sockaddr_in *)srcaddr;
+		saddr6 = (struct sockaddr_in6 *)srcaddr;
+		if (srcaddr->sa_family == AF_INET6)
+			seq_printf(s, ",srcaddr=%pI6c",
+				   &saddr6->sin6_addr);
+		else if (srcaddr->sa_family == AF_INET)
+			seq_printf(s, ",srcaddr=%pI4",
+				   &saddr4->sin_addr.s_addr);
+		else
+			seq_printf(s, ",srcaddr=BAD-AF:%i",
+				   (int)(srcaddr->sa_family));
+	}
+
 	seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c68f31cf4550..6ef0efaf68d4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -139,6 +139,7 @@ struct TCP_Server_Info {
 		struct sockaddr_in sockAddr;
 		struct sockaddr_in6 sockAddr6;
 	} addr;
+	struct sockaddr_storage srcaddr; /* locally bind to this IP */
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
 	struct list_head pending_mid_q;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c99760a523bf..fa884520fb84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
 	struct nls_table *local_nls;
 };
 
@@ -1046,6 +1047,22 @@ cifs_parse_mount_options(char *options, const char *devname,
 						    "long\n");
 				return 1;
 			}
+		} else if (strnicmp(data, "srcaddr", 7) == 0) {
+			vol->srcaddr.ss_family = AF_UNSPEC;
+
+			if (!value || !*value) {
+				printk(KERN_WARNING "CIFS: srcaddr value"
+				       " not specified.\n");
+				return 1;	/* needs_arg; */
+			}
+			i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
+						 value, strlen(value));
+			if (i < 0) {
+				printk(KERN_WARNING "CIFS:  Could not parse"
+				       " srcaddr: %s\n",
+				       value);
+				return 1;
+			}
 		} else if (strnicmp(data, "prefixpath", 10) == 0) {
 			if (!value || !*value) {
 				printk(KERN_WARNING
@@ -1374,8 +1391,36 @@ cifs_parse_mount_options(char *options, const char *devname,
 	return 0;
 }
 
+/** Returns true if srcaddr isn't specified and rhs isn't
+ * specified, or if srcaddr is specified and
+ * matches the IP address of the rhs argument.
+ */
+static bool
+srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
+{
+	switch (srcaddr->sa_family) {
+	case AF_UNSPEC:
+		return (rhs->sa_family == AF_UNSPEC);
+	case AF_INET: {
+		struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
+		struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+		return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
+		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+		return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
+	}
+	default:
+		WARN_ON(1);
+		return false; /* don't expect to be here */
+	}
+}
+
+
 static bool
-match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
+	      struct sockaddr *srcaddr)
 {
 	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
 	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
@@ -1402,6 +1447,9 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
 		break;
 	}
 
+	if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
+		return false;
+
 	return true;
 }
 
@@ -1469,7 +1517,8 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (!match_address(server, addr))
+		if (!match_address(server, addr,
+				   (struct sockaddr *)&vol->srcaddr))
 			continue;
 
 		if (!match_security(server, vol))
@@ -1584,6 +1633,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	 * no need to spinlock this init of tcpStatus or srv_count
 	 */
 	tcp_ses->tcpStatus = CifsNew;
+	memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+	       sizeof(tcp_ses->srcaddr));
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
@@ -1999,6 +2050,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 
 }
 
+static int
+bind_socket(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	if (server->srcaddr.ss_family != AF_UNSPEC) {
+		/* Bind to the specified local IP address */
+		struct socket *socket = server->ssocket;
+		rc = socket->ops->bind(socket,
+				       (struct sockaddr *) &server->srcaddr,
+				       sizeof(server->srcaddr));
+		if (rc < 0) {
+			struct sockaddr_in *saddr4;
+			struct sockaddr_in6 *saddr6;
+			saddr4 = (struct sockaddr_in *)&server->srcaddr;
+			saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
+			if (saddr6->sin6_family == AF_INET6)
+				cERROR(1, "cifs: "
+				       "Failed to bind to: %pI6c, error: %d\n",
+				       &saddr6->sin6_addr, rc);
+			else
+				cERROR(1, "cifs: "
+				       "Failed to bind to: %pI4, error: %d\n",
+				       &saddr4->sin_addr.s_addr, rc);
+		}
+	}
+	return rc;
+}
 
 static int
 ipv4_connect(struct TCP_Server_Info *server)
@@ -2024,6 +2102,10 @@ ipv4_connect(struct TCP_Server_Info *server)
 		cifs_reclassify_socket4(socket);
 	}
 
+	rc = bind_socket(server);
+	if (rc < 0)
+		return rc;
+
 	/* user overrode default port */
 	if (server->addr.sockAddr.sin_port) {
 		rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2186,6 +2268,10 @@ ipv6_connect(struct TCP_Server_Info *server)
 		cifs_reclassify_socket6(socket);
 	}
 
+	rc = bind_socket(server);
+	if (rc < 0)
+		return rc;
+
 	/* user overrode default port */
 	if (server->addr.sockAddr6.sin6_port) {
 		rc = socket->ops->connect(socket,
-- 
cgit v1.2.3


From c69c1b6eaea1b3e1eecf7ad2fba0208ac4a11131 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Sat, 31 Jul 2010 09:14:16 +0200
Subject: cifs: implement CIFSParseMFSymlink()

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..12eb491d4c52 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,68 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "md5.h"
+
+#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
+#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
+#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
+#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
+#define CIFS_MF_SYMLINK_FILE_SIZE \
+	(CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
+
+#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
+#define CIFS_MF_SYMLINK_MD5_FORMAT \
+	"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
+#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
+	md5_hash[0],  md5_hash[1],  md5_hash[2],  md5_hash[3], \
+	md5_hash[4],  md5_hash[5],  md5_hash[6],  md5_hash[7], \
+	md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
+	md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
+
+static int
+CIFSParseMFSymlink(const u8 *buf,
+		   unsigned int buf_len,
+		   unsigned int *_link_len,
+		   char **_link_str)
+{
+	int rc;
+	unsigned int link_len;
+	const char *md5_str1;
+	const char *link_str;
+	struct MD5Context md5_ctx;
+	u8 md5_hash[16];
+	char md5_str2[34];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
+	link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
+
+	rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
+	if (rc != 1)
+		return -EINVAL;
+
+	cifs_MD5_init(&md5_ctx);
+	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+	cifs_MD5_final(md5_hash, &md5_ctx);
+
+	snprintf(md5_str2, sizeof(md5_str2),
+		 CIFS_MF_SYMLINK_MD5_FORMAT,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	if (strncmp(md5_str1, md5_str2, 17) != 0)
+		return -EINVAL;
+
+	if (_link_str) {
+		*_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
+		if (!*_link_str)
+			return -ENOMEM;
+	}
+
+	*_link_len = link_len;
+	return 0;
+}
 
 int
 cifs_hardlink(struct dentry *old_file, struct inode *inode,
-- 
cgit v1.2.3


From 8bfb50a882ccd9804929876470f74edcb23d2326 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Sat, 31 Jul 2010 09:15:10 +0200
Subject: cifs: implement CIFSCouldBeMFSymlink() and CIFSCheckMFSymlink()

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  4 +++
 fs/cifs/link.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 588612867451..8604a45c1107 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -409,4 +409,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
+extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
+extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		const unsigned char *path,
+		struct cifs_sb_info *cifs_sb, int xid);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 12eb491d4c52..bec212b09a02 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -91,6 +91,85 @@ CIFSParseMFSymlink(const u8 *buf,
 	return 0;
 }
 
+bool
+CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
+{
+	if (!(fattr->cf_mode & S_IFREG))
+		/* it's not a symlink */
+		return false;
+
+	if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+		/* it's not a symlink */
+		return false;
+
+	return true;
+}
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		   const unsigned char *path,
+		   struct cifs_sb_info *cifs_sb, int xid)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	u8 *buf;
+	char *pbuf;
+	unsigned int bytes_read = 0;
+	int buf_type = CIFS_NO_BUFFER;
+	unsigned int link_len = 0;
+	FILE_ALL_INFO file_info;
+
+	if (!CIFSCouldBeMFSymlink(fattr))
+		/* it's not a symlink */
+		return 0;
+
+	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+			 cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		return rc;
+
+	if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
+		CIFSSMBClose(xid, pTcon, netfid);
+		/* it's not a symlink */
+		return 0;
+	}
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	pbuf = buf;
+
+	rc = CIFSSMBRead(xid, pTcon, netfid,
+			 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			 0 /* offset */,
+			 &bytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, pTcon, netfid);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
+	kfree(buf);
+	if (rc == -EINVAL)
+		/* it's not a symlink */
+		return 0;
+	if (rc != 0)
+		return rc;
+
+	/* it is a symlink */
+	fattr->cf_eof = link_len;
+	fattr->cf_mode &= ~S_IFMT;
+	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
+	fattr->cf_dtype = DT_LNK;
+	return 0;
+}
+
 int
 cifs_hardlink(struct dentry *old_file, struct inode *inode,
 	      struct dentry *direntry)
-- 
cgit v1.2.3


From 0fd43ae4758b2841656afda4439b80e8a3603af2 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 5 Aug 2010 21:13:44 +0200
Subject: cifs: implement CIFSQueryMFSymLink()

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index bec212b09a02..6e4e8957595d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -91,6 +91,56 @@ CIFSParseMFSymlink(const u8 *buf,
 	return 0;
 }
 
+static int
+CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
+		   const unsigned char *searchName, char **symlinkinfo,
+		   const struct nls_table *nls_codepage, int remap)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	u8 *buf;
+	char *pbuf;
+	unsigned int bytes_read = 0;
+	int buf_type = CIFS_NO_BUFFER;
+	unsigned int link_len = 0;
+	FILE_ALL_INFO file_info;
+
+	rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
+			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+			 nls_codepage, remap);
+	if (rc != 0)
+		return rc;
+
+	if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
+		CIFSSMBClose(xid, tcon, netfid);
+		/* it's not a symlink */
+		return -EINVAL;
+	}
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	pbuf = buf;
+
+	rc = CIFSSMBRead(xid, tcon, netfid,
+			 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			 0 /* offset */,
+			 &bytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, tcon, netfid);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
+	kfree(buf);
+	if (rc != 0)
+		return rc;
+
+	return 0;
+}
+
 bool
 CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
 {
-- 
cgit v1.2.3


From 18bddd1059c5d1e17ad6e49c514c95484aa80a33 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Tue, 3 Aug 2010 11:24:22 +0200
Subject: cifs: implement CIFSFormatMFSymlink()

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 6e4e8957595d..0473a86031a5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -92,6 +92,47 @@ CIFSParseMFSymlink(const u8 *buf,
 }
 
 static int
+CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
+{
+	unsigned int link_len;
+	unsigned int ofs;
+	struct MD5Context md5_ctx;
+	u8 md5_hash[16];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	link_len = strlen(link_str);
+
+	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+		return -ENAMETOOLONG;
+
+	cifs_MD5_init(&md5_ctx);
+	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+	cifs_MD5_final(md5_hash, &md5_ctx);
+
+	snprintf(buf, buf_len,
+		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+		 link_len,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
+	memcpy(buf + ofs, link_str, link_len);
+
+	ofs += link_len;
+	if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = '\n';
+		ofs++;
+	}
+
+	while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = ' ';
+		ofs++;
+	}
+
+	return 0;
+}
+
 CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
 		   const unsigned char *searchName, char **symlinkinfo,
 		   const struct nls_table *nls_codepage, int remap)
-- 
cgit v1.2.3


From 8713d01db8bf948eb9632726f529ec4f821bb025 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 5 Aug 2010 21:15:22 +0200
Subject: cifs: implement CIFSCreateMFSymLink()

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 0473a86031a5..1b2a8c971995 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -133,6 +133,51 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 	return 0;
 }
 
+static int
+CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
+		    const char *fromName, const char *toName,
+		    const struct nls_table *nls_codepage, int remap)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	u8 *buf;
+	unsigned int bytes_written = 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
+			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
+			 nls_codepage, remap);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSSMBWrite(xid, tcon, netfid,
+			  CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			  0 /* offset */,
+			  &bytes_written, buf, NULL, 0);
+	CIFSSMBClose(xid, tcon, netfid);
+	kfree(buf);
+	if (rc != 0)
+		return rc;
+
+	if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EIO;
+
+	return 0;
+}
+
+static int
 CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
 		   const unsigned char *searchName, char **symlinkinfo,
 		   const struct nls_table *nls_codepage, int remap)
-- 
cgit v1.2.3


From 1b12b9c15b4371d83b729b8fc18c670e78a1479b Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 5 Aug 2010 21:19:56 +0200
Subject: cifs: use Minshall+French symlink functions

If configured, Minshall+French Symlinks are used against
all servers. If the server supports UNIX Extensions,
we still create Minshall+French Symlinks on write,
but on read we fallback to UNIX Extension symlinks.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  1 +
 fs/cifs/inode.c      | 14 ++++++++++++++
 fs/cifs/link.c       | 27 +++++++++++++++++++++++----
 fs/cifs/readdir.c    |  9 +++++++++
 4 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..7fde52969896 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -36,6 +36,7 @@
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
 #define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
 #define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
+#define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 93f77d438d3c..016975b8e6dd 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -332,6 +332,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		return rc;
 	}
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (*pinode == NULL) {
 		/* get new inode */
 		cifs_fill_uniqueid(sb, &fattr);
@@ -661,6 +668,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (!*pinode) {
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 1b2a8c971995..cbf7b112287b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -407,7 +407,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	 * but there doesn't seem to be any harm in allowing the client to
 	 * read them.
 	 */
-	if (!(tcon->ses->capabilities & CAP_UNIX)) {
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+	    && !(tcon->ses->capabilities & CAP_UNIX)) {
 		rc = -EACCES;
 		goto out;
 	}
@@ -418,8 +419,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
 
-	rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
-				     cifs_sb->local_nls);
+	rc = -EACCES;
+	/*
+	 * First try Minshall+French Symlinks, if configured
+	 * and fallback to UNIX Extensions Symlinks.
+	 */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
+		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
+					     cifs_sb->local_nls);
+
 	kfree(full_path);
 out:
 	if (rc != 0) {
@@ -459,7 +473,12 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	cFYI(1, "symname is %s", symname);
 
 	/* BB what if DFS and this volume is on different share? BB */
-	if (pTcon->unix_ext)
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
+					 cifs_sb->local_nls,
+					 cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else if (pTcon->unix_ext)
 		rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
 					   cifs_sb->local_nls);
 	/* else
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..6a8b417babab 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -738,6 +738,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 		cifs_autodisable_serverino(cifs_sb);
 	}
 
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
+	    CIFSCouldBeMFSymlink(&fattr))
+		/*
+		 * trying to get the type and mode can be slow,
+		 * so just call those regular files for now, and mark
+		 * for reval
+		 */
+		fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
 	tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
 
-- 
cgit v1.2.3


From 736a33205969c16f81d747db14ff4c0f133609a6 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Fri, 30 Jul 2010 14:56:00 +0200
Subject: cifs: add "mfsymlinks" mount option

This is the start for an implementation of "Minshall+French Symlinks"
(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks).

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README    |  5 +++++
 fs/cifs/cifsfs.c  |  2 ++
 fs/cifs/connect.c | 11 +++++++++++
 3 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..ee68d1036544 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -527,6 +527,11 @@ A partial list of the supported mount options follows:
 		SFU does).  In the future the bottom 9 bits of the
 		mode also will be emulated using queries of the security
 		descriptor (ACL).
+ mfsymlinks     Enable support for Minshall+French symlinks
+		(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+		This option is ignored when specified together with the
+		'sfu' option. Minshall+French symlinks are used even if
+		the server supports the CIFS Unix Extensions.
  sign           Must use packet signing (helps avoid unwanted data modification
 		by intermediate systems in the route).  Note that signing
 		does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1b6ddd6f760f..52e89ea07458 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -441,6 +441,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 		seq_printf(s, ",dynperm");
 	if (m->mnt_sb->s_flags & MS_POSIXACL)
 		seq_printf(s, ",acl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		seq_printf(s, ",mfsymlinks");
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fa884520fb84..435b912f5de5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -100,6 +100,7 @@ struct smb_vol {
 	bool noautotune:1;
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	bool fsc:1;	/* enable fscache */
+	bool mfsymlinks:1; /* use Minshall+French Symlinks */
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
@@ -1342,6 +1343,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 				"/proc/fs/cifs/LookupCacheEnabled to 0\n");
 		} else if (strnicmp(data, "fsc", 3) == 0) {
 			vol->fsc = true;
+		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
+			vol->mfsymlinks = true;
 		} else
 			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
 						data);
@@ -2554,6 +2557,14 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
+	if (pvolume_info->mfsymlinks) {
+		if (pvolume_info->sfu_emul) {
+			cERROR(1,  "mount option mfsymlinks ignored if sfu "
+				   "mount option is used");
+		} else {
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
+		}
+	}
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
 		cERROR(1, "mount option dynperm ignored if cifsacl "
-- 
cgit v1.2.3


From 5fe97cfddc426f3145e8673b68faab7e54462173 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:30 -0700
Subject: cifs: add tcon field to cifsFileInfo struct

Eventually, we'll have more than one tcon per superblock. At that point,
we'll need to know which one is associated with a particular fid. For
now, this is just set from the cifs_sb->tcon pointer, but eventually
the caller of cifs_new_fileinfo will pass a tcon pointer in.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 1 +
 fs/cifs/dir.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6ef0efaf68d4..6f3968a5fea4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,6 +351,7 @@ struct cifsFileInfo {
 	struct file *pfile; /* needed for writepage */
 	struct inode *pInode; /* needed for oplock break */
 	struct vfsmount *mnt;
+	struct cifsTconInfo *tcon;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
 	bool closePend:1;	/* file is marked to close */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 0f947bf73f8e..613589cf5172 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -150,6 +150,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
+	pCifsFile->tcon = cifs_sb->tcon;
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
-- 
cgit v1.2.3


From ab9db8b737210bec365593a04dd1c534220bb311 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 21 Sep 2010 08:14:46 -0700
Subject: cifs: allow matching of tcp sessions in CifsNew state

With commit 7332f2a6217ee6925f83ef0e725013067ed316ba, cifsd will no
longer exit when the socket abends and the tcpStatus is CifsNew. With
that change, there's no reason to avoid matching an existing session in
this state.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 435b912f5de5..271038b6ec0e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1511,15 +1511,6 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 
 	write_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-		/*
-		 * the demux thread can exit on its own while still in CifsNew
-		 * so don't accept any sockets in that state. Since the
-		 * tcpStatus never changes back to CifsNew it's safe to check
-		 * for this without a lock.
-		 */
-		if (server->tcpStatus == CifsNew)
-			continue;
-
 		if (!match_address(server, addr,
 				   (struct sockaddr *)&vol->srcaddr))
 			continue;
-- 
cgit v1.2.3


From 17edec6f563ba6b5630329fbe9473557bb475c3d Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 22 Sep 2010 02:01:59 +0000
Subject: [CIFS] Remove obsolete header

We decided not to use connector to do the upcalls so cn_cifs.h
is obsolete - remove it.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cn_cifs.h | 37 -------------------------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 fs/cifs/cn_cifs.h

(limited to 'fs')

diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- *   fs/cifs/cn_cifs.h
- *
- *   Copyright (c) International Business Machines  Corp., 2002
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _CN_CIFS_H
-#define _CN_CIFS_H
-#ifdef CONFIG_CIFS_UPCALL
-#include <linux/types.h>
-#include <linux/connector.h>
-
-struct cifs_upcall {
-	char signature[4]; /* CIFS */
-	enum command {
-		CIFS_GET_IP = 0x00000001,   /* get ip address for hostname */
-		CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
-	} command;
-	/* union cifs upcall data follows */
-};
-#endif /* CIFS_UPCALL */
-#endif /* _CN_CIFS_H */
-- 
cgit v1.2.3


From d3bf5221d3274b5015ad18a55060b074cca8d2f0 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 22 Sep 2010 19:15:36 +0000
Subject: [CIFS] Fix ordering of cleanup on module init failure

If registering fs cache failed, we weren't cleaning up proc.

Acked-by: Jeff Layton <jlayton@redhat.com>
CC: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c  | 18 +++++++++---------
 fs/cifs/connect.c |  1 -
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 52e89ea07458..eeb8c67a3f29 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -933,11 +933,11 @@ init_cifs(void)
 
 	rc = cifs_fscache_register();
 	if (rc)
-		goto out;
+		goto out_clean_proc;
 
 	rc = cifs_init_inodecache();
 	if (rc)
-		goto out_clean_proc;
+		goto out_unreg_fscache;
 
 	rc = cifs_init_mids();
 	if (rc)
@@ -959,19 +959,19 @@ init_cifs(void)
 	return 0;
 
 #ifdef CONFIG_CIFS_UPCALL
- out_unregister_filesystem:
+out_unregister_filesystem:
 	unregister_filesystem(&cifs_fs_type);
 #endif
- out_destroy_request_bufs:
+out_destroy_request_bufs:
 	cifs_destroy_request_bufs();
- out_destroy_mids:
+out_destroy_mids:
 	cifs_destroy_mids();
- out_destroy_inodecache:
+out_destroy_inodecache:
 	cifs_destroy_inodecache();
- out_clean_proc:
-	cifs_proc_clean();
+out_unreg_fscache:
 	cifs_fscache_unregister();
- out:
+out_clean_proc:
+	cifs_proc_clean();
 	return rc;
 }
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 271038b6ec0e..230410e0a453 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,6 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include "rfc1002pdu.h"
-#include "cn_cifs.h"
 #include "fscache.h"
 
 #define CIFS_PORT 445
-- 
cgit v1.2.3


From ba00ba64cf0895e4c2ac507e56306363dc125a90 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:31 -0700
Subject: cifs: make various routines use the cifsFileInfo->tcon pointer

...where it's available and appropriate.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c  |  4 ++--
 fs/cifs/file.c    | 39 ++++++++++++++++-----------------------
 fs/cifs/inode.c   | 22 ++++++++++++++++------
 fs/cifs/ioctl.c   | 17 +++--------------
 fs/cifs/readdir.c | 19 ++++++++-----------
 5 files changed, 45 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index eeb8c67a3f29..7193494efc03 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -589,6 +589,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	/* note that this is called by vfs setlease with the BKL held
 	   although I doubt that BKL is needed here in cifs */
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = file->private_data;
 
 	if (!(S_ISREG(inode->i_mode)))
 		return -EINVAL;
@@ -599,8 +600,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	    ((arg == F_WRLCK) &&
 		(CIFS_I(inode)->clientCanCacheAll)))
 		return generic_setlease(file, arg, lease);
-	else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
-			!CIFS_I(inode)->clientCanCacheRead)
+	else if (cfile->tcon->local_lease && !CIFS_I(inode)->clientCanCacheRead)
 		/* If the server claims to support oplock on this
 		   file, then we still need to check oplock even
 		   if the local_lease mount option is set, but there
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 60061b9c2f67..84979fc77862 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -461,7 +461,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
+	tcon = pCifsFile->tcon;
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
@@ -575,7 +575,7 @@ int cifs_close(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = pSMBFile->tcon;
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
 		write_lock(&GlobalSMBSeslock);
@@ -653,11 +653,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	if (pCFileStruct) {
-		struct cifsTconInfo *pTcon;
-		struct cifs_sb_info *cifs_sb =
-			CIFS_SB(file->f_path.dentry->d_sb);
-
-		pTcon = cifs_sb->tcon;
+		struct cifsTconInfo *pTcon = pCFileStruct->tcon;
 
 		cFYI(1, "Freeing private data in close dir");
 		write_lock(&GlobalSMBSeslock);
@@ -767,7 +763,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	tcon = cifs_sb->tcon;
+	tcon = ((struct cifsFileInfo *)file->private_data)->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -960,14 +956,14 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	pTcon = cifs_sb->tcon;
-
 	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
 	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
+
 	open_file = file->private_data;
+	pTcon = open_file->tcon;
 
 	rc = generic_write_checks(file, poffset, &write_size, 0);
 	if (rc)
@@ -1062,14 +1058,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	pTcon = cifs_sb->tcon;
-
 	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
 	   *poffset, file->f_path.dentry->d_name.name);
 
 	if (file->private_data == NULL)
 		return -EBADF;
 	open_file = file->private_data;
+	pTcon = open_file->tcon;
 
 	xid = GetXid();
 
@@ -1284,7 +1279,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	int rc = -EFAULT;
 	int bytes_written = 0;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	struct inode *inode;
 	struct cifsFileInfo *open_file;
 
@@ -1293,7 +1287,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 
 	inode = page->mapping->host;
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
 
 	offset += (loff_t)from;
 	write_data = kmap(page);
@@ -1352,6 +1345,7 @@ static int cifs_writepages(struct address_space *mapping,
 	int nr_pages;
 	__u64 offset = 0;
 	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *tcon;
 	struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
 	struct page *page;
 	struct pagevec pvec;
@@ -1501,8 +1495,9 @@ retry:
 				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
+				tcon = open_file->tcon;
 				long_op = cifs_write_timeout(cifsi, offset);
-				rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
+				rc = CIFSSMBWrite2(xid, tcon,
 						   open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
@@ -1520,8 +1515,7 @@ retry:
 					else
 						set_bit(AS_EIO, &mapping->flags);
 				} else {
-					cifs_stats_bytes_written(cifs_sb->tcon,
-								 bytes_written);
+					cifs_stats_bytes_written(tcon, bytes_written);
 				}
 			}
 			for (i = 0; i < n_iov; i++) {
@@ -1665,7 +1659,7 @@ int cifs_fsync(struct file *file, int datasync)
 	if (rc == 0) {
 		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = CIFS_SB(inode->i_sb)->tcon;
+		tcon = smbfile->tcon;
 		if (!rc && tcon && smbfile &&
 		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
 			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
@@ -1750,7 +1744,6 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -1758,6 +1751,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 		return rc;
 	}
 	open_file = file->private_data;
+	pTcon = open_file->tcon;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1831,7 +1825,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -1839,6 +1832,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		return rc;
 	}
 	open_file = file->private_data;
+	pTcon = open_file->tcon;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1974,7 +1968,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	}
 	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = open_file->tcon;
 
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2312,7 +2306,6 @@ void cifs_oplock_break(struct work_struct *work)
 						  oplock_break);
 	struct inode *inode = cfile->pInode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
 	int rc, waitrc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
@@ -2339,7 +2332,7 @@ void cifs_oplock_break(struct work_struct *work)
 	 * disconnected since oplock already released by the server
 	 */
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
+		rc = CIFSSMBLock(0, cfile->tcon, cfile->netfid, 0, 0, 0, 0,
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 016975b8e6dd..0fa145596fcf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -288,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = cfile->tcon;
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -522,8 +522,8 @@ int cifs_get_file_info(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = cfile->tcon;
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -891,7 +891,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO	info_buf;
 
 	if (attrs == NULL)
@@ -934,9 +934,12 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
+		pTcon = open_file->tcon;
 		goto set_via_filehandle;
 	}
 
+	pTcon = cifs_sb->tcon;
+
 	/*
 	 * NT4 apparently returns success on this call, but it doesn't
 	 * really work.
@@ -1611,11 +1614,12 @@ int cifs_revalidate_file(struct file *filp)
 {
 	int rc = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
 	if (!cifs_inode_needs_reval(inode))
 		goto check_inval;
 
-	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+	if (cfile->tcon->unix_ext)
 		rc = cifs_get_file_info_unix(filp);
 	else
 		rc = cifs_get_file_info(filp);
@@ -1720,7 +1724,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = NULL;
 
 	/*
 	 * To avoid spurious oplock breaks from server, in the case of
@@ -1735,6 +1739,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	if (open_file) {
 		__u16 nfid = open_file->netfid;
 		__u32 npid = open_file->pid;
+		pTcon = open_file->tcon;
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
 		cifsFileInfo_put(open_file);
@@ -1749,6 +1754,9 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		rc = -EINVAL;
 
 	if (rc != 0) {
+		if (pTcon == NULL)
+			pTcon = cifs_sb->tcon;
+
 		/* Set file size by pathname rather than by handle
 		   either because no valid, writeable file handle for
 		   it was found or because there was an error setting
@@ -1798,7 +1806,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct inode *inode = direntry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon;
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
 
@@ -1889,9 +1897,11 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if (open_file) {
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
+		pTcon = open_file->tcon;
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
 		cifsFileInfo_put(open_file);
 	} else {
+		pTcon = cifs_sb->tcon;
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
 				    cifs_sb->mnt_cifs_flags &
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..cc70a61a47d2 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
+	struct cifsFileInfo *pSMBFile = filep->private_data;
+	struct cifsTconInfo *tcon = pSMBFile->tcon;
 	__u64	ExtAttrBits = 0;
 	__u64	ExtAttrMask = 0;
-	__u64   caps;
-	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *pSMBFile = filep->private_data;
+	__u64   caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
 #endif /* CONFIG_CIFS_POSIX */
 
 	xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
-#ifdef CONFIG_CIFS_POSIX
-	tcon = cifs_sb->tcon;
-	if (tcon)
-		caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
-	else {
-		rc = -EIO;
-		FreeXid(xid);
-		return -EIO;
-	}
-#endif /* CONFIG_CIFS_POSIX */
-
 	switch (command) {
 		case CIFS_IOC_CHECKUMOUNT:
 			cFYI(1, "User unmount attempted");
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6a8b417babab..d7784a95134f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -228,22 +228,21 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 
-	if (file->private_data == NULL) {
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	if (cifs_sb == NULL)
+		return -EINVAL;
+
+	if (file->private_data == NULL)
 		file->private_data =
 			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-	}
 
 	if (file->private_data == NULL)
 		return -ENOMEM;
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = true;
 	cifsFile->srch_inf.endOfSearch = false;
-
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EINVAL;
-
-	pTcon = cifs_sb->tcon;
+	cifsFile->tcon = cifs_sb->tcon;
+	pTcon = cifsFile->tcon;
 	if (pTcon == NULL)
 		return -EINVAL;
 
@@ -786,9 +785,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
-	if (pTcon == NULL)
-		return -EINVAL;
 
 	switch ((int) file->f_pos) {
 	case 0:
@@ -838,6 +834,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		} */
 
+		pTcon = cifsFile->tcon;
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-- 
cgit v1.2.3


From a6e8a8455c94565c53e1a1756d2ab9d9e3a902b8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:33 -0700
Subject: cifs: add function to get a tcon from cifs_sb

When we convert cifs to do multiple sessions per mount, we'll need more
than one tcon per superblock. At that point "cifs_sb->tcon" will make
no sense. Add a new accessor function that gets a tcon given a cifs_sb.
For now, it just returns cifs_sb->tcon. Later it'll do more.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c |  2 +-
 fs/cifs/cifsacl.c      | 16 ++++++++--------
 fs/cifs/cifsfs.c       | 11 +++--------
 fs/cifs/cifsglob.h     |  6 ++++++
 fs/cifs/connect.c      |  4 ++--
 fs/cifs/dir.c          | 20 ++++++++++----------
 fs/cifs/file.c         |  4 ++--
 fs/cifs/fscache.c      |  7 ++++---
 fs/cifs/inode.c        | 49 +++++++++++++++++++++++++------------------------
 fs/cifs/link.c         |  8 ++++----
 fs/cifs/misc.c         |  2 +-
 fs/cifs/readdir.c      |  8 ++++----
 fs/cifs/xattr.c        |  8 ++++----
 13 files changed, 74 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..f1e13ea45a17 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -316,7 +316,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	nd->path.dentry = dget(dentry);
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	ses = cifs_sb->tcon->ses;
+	ses = cifs_sb_tcon(cifs_sb)->ses;
 
 	if (!ses) {
 		rc = -EINVAL;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..32f244909a0d 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -559,7 +559,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 	int xid, rc;
 
 	xid = GetXid();
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, &pntsd, pacllen);
 	FreeXid(xid);
 
 
@@ -577,7 +577,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+	rc = CIFSSMBOpen(xid, cifs_sb_tcon(cifs_sb), path, FILE_OPEN, READ_CONTROL, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -585,10 +585,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, &pntsd, pacllen);
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+	CIFSSMBClose(xid, cifs_sb_tcon(cifs_sb), fid);
  out:
 	FreeXid(xid);
 	return pntsd;
@@ -618,7 +618,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 	int xid, rc;
 
 	xid = GetXid();
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, pnntsd, acllen);
 	FreeXid(xid);
 
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
@@ -634,7 +634,7 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+	rc = CIFSSMBOpen(xid, cifs_sb_tcon(cifs_sb), path, FILE_OPEN, WRITE_DAC, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -642,10 +642,10 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, pnntsd, acllen);
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+	CIFSSMBClose(xid, cifs_sb_tcon(cifs_sb), fid);
  out:
 	FreeXid(xid);
 	return rc;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 7193494efc03..b9624abb7261 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -137,9 +137,6 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
 	sb->s_bdi = &cifs_sb->bdi;
-/*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
-	    sb->s_blocksize =
-		cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
 	inode = cifs_root_iget(sb, ROOT_I);
@@ -225,7 +222,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 	int rc = -EOPNOTSUPP;
 	int xid;
 
@@ -367,7 +364,7 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
@@ -458,9 +455,7 @@ static void cifs_umount_begin(struct super_block *sb)
 	if (cifs_sb == NULL)
 		return;
 
-	tcon = cifs_sb->tcon;
-	if (tcon == NULL)
-		return;
+	tcon = cifs_sb_tcon(cifs_sb);
 
 	read_lock(&cifs_tcp_ses_lock);
 	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6f3968a5fea4..c3510168438e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -413,6 +413,12 @@ CIFS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+static inline struct cifsTconInfo *
+cifs_sb_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return cifs_sb->tcon;
+}
+
 static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 230410e0a453..c42d37fb5b7c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3026,8 +3026,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	int rc = 0;
 	char *tmp;
 
-	if (cifs_sb->tcon)
-		cifs_put_tcon(cifs_sb->tcon);
+	if (cifs_sb_tcon(cifs_sb))
+		cifs_put_tcon(cifs_sb_tcon(cifs_sb));
 
 	cifs_sb->tcon = NULL;
 	tmp = cifs_sb->prepath;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 613589cf5172..f660a15eb14f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -64,8 +64,8 @@ build_path_from_dentry(struct dentry *direntry)
 	cifs_sb = CIFS_SB(direntry->d_sb);
 	dirsep = CIFS_DIR_SEP(cifs_sb);
 	pplen = cifs_sb->prepathlen;
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (cifs_sb_tcon(cifs_sb) && (cifs_sb_tcon(cifs_sb)->Flags & SMB_SHARE_IS_IN_DFS))
+		dfsplen = strnlen(cifs_sb_tcon(cifs_sb)->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
 	/* BB test paths to Windows with '/' in the midst of prepath */
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, cifs_sb_tcon(cifs_sb)->treeName, dfsplen);
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
 			int i;
 			for (i = 0; i < dfsplen; i++) {
@@ -150,7 +150,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
-	pCifsFile->tcon = cifs_sb->tcon;
+	pCifsFile->tcon = cifs_sb_tcon(cifs_sb);
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
@@ -158,7 +158,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
+	list_add(&pCifsFile->tlist, &cifs_sb_tcon(cifs_sb)->openFileList);
 	pCifsInode = CIFS_I(newinode);
 	if (pCifsInode) {
 		/* if readable file instance put first in list*/
@@ -225,7 +225,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		posix_flags |= SMB_O_DIRECT;
 
 	mode &= ~current_umask();
-	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+	rc = CIFSPOSIXCreate(xid, cifs_sb_tcon(cifs_sb), posix_flags, mode,
 			pnetfid, presp_data, poplock, full_path,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -298,7 +298,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
+	tcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -373,7 +373,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
 		create_options |= CREATE_OPTION_READONLY;
 
-	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, create_options,
 			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -504,7 +504,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -631,7 +631,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/* check whether path exists */
 
 	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	/*
 	 * Don't allow the separator character in a path component.
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 84979fc77862..6712c2b0a0ae 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -235,7 +235,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
+	tcon = cifs_sb_tcon(cifs_sb);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 
@@ -345,7 +345,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..ec4318b019cc 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -66,11 +66,11 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 	if (cifsi->fscache)
 		return;
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+	cifsi->fscache = fscache_acquire_cookie(cifs_sb_tcon(cifs_sb)->fscache,
 				&cifs_fscache_inode_object_def,
 				cifsi);
 	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
-			cifs_sb->tcon->fscache, cifsi->fscache);
+			cifs_sb_tcon(cifs_sb)->fscache, cifsi->fscache);
 }
 
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 		/* retire the current fscache cache and get a new one */
 		fscache_relinquish_cookie(cifsi->fscache, 1);
 
-		cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+		cifsi->fscache = fscache_acquire_cookie(
+					cifs_sb_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
 					cifsi);
 		cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0fa145596fcf..dce2d598927e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -52,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 
 
 		/* check if server can support readpages */
-		if (cifs_sb->tcon->ses->server->maxBuf <
+		if (cifs_sb_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -315,7 +315,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct cifsTconInfo *tcon;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb->tcon;
+	tcon = cifs_sb_tcon(cifs_sb);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	/* could have done a find first instead but this returns more info */
@@ -360,7 +361,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -449,7 +450,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	char ea_value[4];
 	__u32 mode;
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, cifs_sb_tcon(cifs_sb), path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -489,8 +490,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -566,7 +567,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
@@ -706,8 +707,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (cifs_sb_tcon(cifs_sb) && (cifs_sb_tcon(cifs_sb)->Flags & SMB_SHARE_IS_IN_DFS))
+		dfsplen = strnlen(cifs_sb_tcon(cifs_sb)->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -716,7 +717,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, cifs_sb_tcon(cifs_sb)->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
 		 * windows or posix style path names
 		 */
@@ -841,7 +842,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb->tcon->unix_ext)
+	if (cifs_sb_tcon(cifs_sb)->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -852,10 +853,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 
 #ifdef CONFIG_CIFS_FSCACHE
 	/* populate tcon->resource_id */
-	cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+	cifs_sb_tcon(cifs_sb)->resource_id = CIFS_I(inode)->uniqueid;
 #endif
 
-	if (rc && cifs_sb->tcon->ipc) {
+	if (rc && cifs_sb_tcon(cifs_sb)->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -938,7 +939,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 		goto set_via_filehandle;
 	}
 
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	/*
 	 * NT4 apparently returns success on this call, but it doesn't
@@ -1000,7 +1001,7 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
@@ -1111,7 +1112,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
@@ -1213,7 +1214,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -1394,7 +1395,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -1435,7 +1436,7 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 		struct dentry *to_dentry, const char *toPath)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
 	__u16 srcfid;
 	int oplock, rc;
 
@@ -1486,7 +1487,7 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	int xid, rc, tmprc;
 
 	cifs_sb = CIFS_SB(source_dir->i_sb);
-	tcon = cifs_sb->tcon;
+	tcon = cifs_sb_tcon(cifs_sb);
 
 	xid = GetXid();
 
@@ -1660,7 +1661,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 		 "jiffies %ld", full_path, inode, inode->i_count.counter,
 		 dentry, dentry->d_time, jiffies);
 
-	if (CIFS_SB(sb)->tcon->unix_ext)
+	if (cifs_sb_tcon(CIFS_SB(sb))->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1755,7 +1756,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 
 	if (rc != 0) {
 		if (pTcon == NULL)
-			pTcon = cifs_sb->tcon;
+			pTcon = cifs_sb_tcon(cifs_sb);
 
 		/* Set file size by pathname rather than by handle
 		   either because no valid, writeable file handle for
@@ -1901,7 +1902,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
 		cifsFileInfo_put(open_file);
 	} else {
-		pTcon = cifs_sb->tcon;
+		pTcon = cifs_sb_tcon(cifs_sb);
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
 				    cifs_sb->mnt_cifs_flags &
@@ -2086,7 +2087,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
 	struct inode *inode = direntry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
 
 	if (pTcon->unix_ext)
 		return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cbf7b112287b..66db2d61fa43 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -249,7 +249,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 	int rc;
 	int oplock = 0;
 	__u16 netfid = 0;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
 	u8 *buf;
 	char *pbuf;
 	unsigned int bytes_read = 0;
@@ -321,7 +321,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 	xid = GetXid();
 
 	cifs_sb_target = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_target->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb_target);
 
 /* No need to check for cross device links since server will do that
    BB note DFS case in future though (when we may have to check) */
@@ -390,7 +390,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	char *full_path = NULL;
 	char *target_path = NULL;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 
 	xid = GetXid();
 
@@ -459,7 +459,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..c5cbfdb2a58b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -729,6 +729,6 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 			   "properly. Hardlinks will not be recognized on this "
 			   "mount. Consider mounting with the \"noserverino\" "
 			   "option to silence this message.",
-			   cifs_sb->tcon->treeName);
+			   cifs_sb_tcon(cifs_sb)->treeName);
 	}
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d7784a95134f..3efc2424964f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 		return NULL;
 	}
 
-	if (CIFS_SB(sb)->tcon->nocase)
+	if (cifs_sb_tcon(CIFS_SB(sb))->nocase)
 		dentry->d_op = &cifs_ci_dentry_ops;
 	else
 		dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
 cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
 		       struct cifs_sb_info *cifs_sb)
 {
-	int offset = cifs_sb->tcon->ses->server->timeAdj;
+	int offset = cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 	int len;
 	int oplock = 0;
 	int rc;
-	struct cifsTconInfo *ptcon = cifs_sb->tcon;
+	struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
 	char *tmpbuffer;
 
 	rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -241,7 +241,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = true;
 	cifsFile->srch_inf.endOfSearch = false;
-	cifsFile->tcon = cifs_sb->tcon;
+	cifsFile->tcon = cifs_sb_tcon(cifs_sb);
 	pTcon = cifsFile->tcon;
 	if (pTcon == NULL)
 		return -EINVAL;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..41f95bf67977 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -61,7 +61,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -116,7 +116,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -224,7 +224,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -346,7 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 		return -EIO;
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = cifs_sb_tcon(cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3


From f6acb9d0596889a774e142ed76cb05b90d9763d2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:34 -0700
Subject: cifs: temporarily rename cifs_sb->tcon to ptcon to catch stragglers

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h | 2 +-
 fs/cifs/cifsglob.h   | 2 +-
 fs/cifs/connect.c    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7fde52969896..ba0afd3acff4 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -39,7 +39,7 @@
 #define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
 
 struct cifs_sb_info {
-	struct cifsTconInfo *tcon;	/* primary mount */
+	struct cifsTconInfo *ptcon;	/* primary mount */
 	struct list_head nested_tcon_q;
 	struct nls_table *local_nls;
 	unsigned int rsize;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c3510168438e..cc8300c741b6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -416,7 +416,7 @@ CIFS_SB(struct super_block *sb)
 static inline struct cifsTconInfo *
 cifs_sb_tcon(struct cifs_sb_info *cifs_sb)
 {
-	return cifs_sb->tcon;
+	return cifs_sb->ptcon;
 }
 
 static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c42d37fb5b7c..b4bacea54626 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2727,7 +2727,7 @@ try_mount_again:
 		goto remote_path_check;
 	}
 
-	cifs_sb->tcon = tcon;
+	cifs_sb->ptcon = tcon;
 
 	/* do not care if following two calls succeed - informational */
 	if (!tcon->ipc) {
@@ -3029,7 +3029,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	if (cifs_sb_tcon(cifs_sb))
 		cifs_put_tcon(cifs_sb_tcon(cifs_sb));
 
-	cifs_sb->tcon = NULL;
+	cifs_sb->ptcon = NULL;
 	tmp = cifs_sb->prepath;
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
-- 
cgit v1.2.3


From 0d424ad0a4b8c08e45928bccfa5b4b240097b01b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:35 -0700
Subject: cifs: add cifs_sb_master_tcon and convert some callers to use it

At mount time, we'll always need to create a tcon that will serve as a
template for others that are associated with the mount. This tcon is
known as the "master" tcon.

In some cases, we'll need to use that tcon regardless of who's accessing
the mount. Add an accessor function for the master tcon and go ahead and
switch the appropriate places to use it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |  6 +++---
 fs/cifs/cifsglob.h |  7 +++++++
 fs/cifs/connect.c  |  4 ++--
 fs/cifs/dir.c      | 10 +++++-----
 fs/cifs/fscache.c  | 12 ++++++------
 fs/cifs/inode.c    | 29 ++++++++++++++++-------------
 fs/cifs/misc.c     |  2 +-
 fs/cifs/readdir.c  |  4 ++--
 8 files changed, 42 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b9624abb7261..898d2a5cfad2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -222,7 +222,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	int rc = -EOPNOTSUPP;
 	int xid;
 
@@ -364,7 +364,7 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
@@ -455,7 +455,7 @@ static void cifs_umount_begin(struct super_block *sb)
 	if (cifs_sb == NULL)
 		return;
 
-	tcon = cifs_sb_tcon(cifs_sb);
+	tcon = cifs_sb_master_tcon(cifs_sb);
 
 	read_lock(&cifs_tcp_ses_lock);
 	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cc8300c741b6..c265ebdcd177 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -419,6 +419,13 @@ cifs_sb_tcon(struct cifs_sb_info *cifs_sb)
 	return cifs_sb->ptcon;
 }
 
+/* This function is always expected to succeed */
+static inline struct cifsTconInfo *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return cifs_sb->ptcon;
+}
+
 static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b4bacea54626..f6a3091c2874 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3025,9 +3025,9 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
 	char *tmp;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	if (cifs_sb_tcon(cifs_sb))
-		cifs_put_tcon(cifs_sb_tcon(cifs_sb));
+	cifs_put_tcon(tcon);
 
 	cifs_sb->ptcon = NULL;
 	tmp = cifs_sb->prepath;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f660a15eb14f..fe02435acb3c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
 	int dfsplen;
 	char *full_path;
 	char dirsep;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (direntry == NULL)
 		return NULL;  /* not much we can do if dentry is freed and
 		we need to reopen the file after it was closed implicitly
 		when the server crashed */
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
 	dirsep = CIFS_DIR_SEP(cifs_sb);
 	pplen = cifs_sb->prepathlen;
-	if (cifs_sb_tcon(cifs_sb) && (cifs_sb_tcon(cifs_sb)->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb_tcon(cifs_sb)->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
 	/* BB test paths to Windows with '/' in the midst of prepath */
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb_tcon(cifs_sb)->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
 			int i;
 			for (i = 0; i < dfsplen; i++) {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index ec4318b019cc..a2ad94efcfe6 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -62,15 +62,15 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (cifsi->fscache)
 		return;
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb_tcon(cifs_sb)->fscache,
-				&cifs_fscache_inode_object_def,
-				cifsi);
-	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
-			cifs_sb_tcon(cifs_sb)->fscache, cifsi->fscache);
+	cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi);
+	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+				cifsi->fscache);
 }
 
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -118,7 +118,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 		fscache_relinquish_cookie(cifsi->fscache, 1);
 
 		cifsi->fscache = fscache_acquire_cookie(
-					cifs_sb_tcon(cifs_sb)->fscache,
+					cifs_sb_master_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
 					cifsi);
 		cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index dce2d598927e..da716d96dae6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -52,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 
 
 		/* check if server can support readpages */
-		if (cifs_sb_tcon(cifs_sb)->ses->server->maxBuf <
+		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -476,6 +476,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
 	if (info->DeletePending)
@@ -490,8 +492,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -698,6 +700,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
 	char *full_path = NULL;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* if no prefix path, simply set path to the root of share to "" */
 	if (pplen == 0) {
@@ -707,8 +710,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb_tcon(cifs_sb) && (cifs_sb_tcon(cifs_sb)->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb_tcon(cifs_sb)->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -717,7 +720,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb_tcon(cifs_sb)->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
 		 * windows or posix style path names
 		 */
@@ -831,18 +834,18 @@ retry_iget5_locked:
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct inode *inode = NULL;
 	long rc;
 	char *full_path;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	cifs_sb = CIFS_SB(sb);
 	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb_tcon(cifs_sb)->unix_ext)
+	if (tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -853,10 +856,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 
 #ifdef CONFIG_CIFS_FSCACHE
 	/* populate tcon->resource_id */
-	cifs_sb_tcon(cifs_sb)->resource_id = CIFS_I(inode)->uniqueid;
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
 #endif
 
-	if (rc && cifs_sb_tcon(cifs_sb)->ipc) {
+	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -1661,7 +1664,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 		 "jiffies %ld", full_path, inode, inode->i_count.counter,
 		 dentry, dentry->d_time, jiffies);
 
-	if (cifs_sb_tcon(CIFS_SB(sb))->unix_ext)
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -2087,7 +2090,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
 	struct inode *inode = direntry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
+	struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (pTcon->unix_ext)
 		return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c5cbfdb2a58b..252f2768db84 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -729,6 +729,6 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 			   "properly. Hardlinks will not be recognized on this "
 			   "mount. Consider mounting with the \"noserverino\" "
 			   "option to silence this message.",
-			   cifs_sb_tcon(cifs_sb)->treeName);
+			   cifs_sb_master_tcon(cifs_sb)->treeName);
 	}
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 3efc2424964f..887a7e230376 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 		return NULL;
 	}
 
-	if (cifs_sb_tcon(CIFS_SB(sb))->nocase)
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
 		dentry->d_op = &cifs_ci_dentry_ops;
 	else
 		dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
 cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
 		       struct cifs_sb_info *cifs_sb)
 {
-	int offset = cifs_sb_tcon(cifs_sb)->ses->server->timeAdj;
+	int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
-- 
cgit v1.2.3


From f7a40689fd1e963cb1006349e050c07584895db5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Sep 2010 16:01:36 -0700
Subject: cifs: have cifs_new_fileinfo take a tcon arg

To minimize calls to cifs_sb_tcon and to allow for a clear error path if
a tcon can't be acquired.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  4 ++--
 fs/cifs/dir.c       | 22 ++++++++++++----------
 fs/cifs/file.c      |  4 ++--
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8604a45c1107..610b2263c9f3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,8 +107,8 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 
 extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
-				struct vfsmount *mnt, unsigned int oflags,
-				__u32 oplock);
+				struct vfsmount *mnt, struct cifsTconInfo *tcon,
+				unsigned int oflags, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
 				int mode, int oflags,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index fe02435acb3c..23ec28a4c11f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -131,9 +131,9 @@ cifs_bp_rename_retry:
 }
 
 struct cifsFileInfo *
-cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
-		  struct file *file, struct vfsmount *mnt, unsigned int oflags,
-		  __u32 oplock)
+cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
+		  struct vfsmount *mnt, struct cifsTconInfo *tcon,
+		  unsigned int oflags, __u32 oplock)
 {
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
@@ -150,7 +150,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
-	pCifsFile->tcon = cifs_sb_tcon(cifs_sb);
+	pCifsFile->tcon = tcon;
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
@@ -158,7 +158,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &cifs_sb_tcon(cifs_sb)->openFileList);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
 	pCifsInode = CIFS_I(newinode);
 	if (pCifsInode) {
 		/* if readable file instance put first in list*/
@@ -191,6 +191,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifs_fattr fattr;
+	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
 
 	cFYI(1, "posix open %s", full_path);
 
@@ -225,9 +226,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		posix_flags |= SMB_O_DIRECT;
 
 	mode &= ~current_umask();
-	rc = CIFSPOSIXCreate(xid, cifs_sb_tcon(cifs_sb), posix_flags, mode,
-			pnetfid, presp_data, poplock, full_path,
-			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
+			     poplock, full_path, cifs_sb->local_nls,
+			     cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc)
 		goto posix_open_ret;
@@ -466,7 +467,8 @@ cifs_create_set_dentry:
 		}
 
 		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-					       nd->path.mnt, oflags, oplock);
+					       nd->path.mnt, tcon, oflags,
+						oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -726,7 +728,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			}
 
 			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  nd->path.mnt,
+						  nd->path.mnt, pTcon,
 						  nd->intent.open.flags,
 						  oplock);
 			if (cfile == NULL) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6712c2b0a0ae..e5f463e15c9b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -277,7 +277,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
 							file->f_path.mnt,
-							oflags, oplock);
+							tcon, oflags, oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
@@ -370,7 +370,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 
 	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
-					file->f_flags, oplock);
+					tcon, file->f_flags, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
-- 
cgit v1.2.3


From f3983c2133e9bea9c8b4f690737d15e3e9b02491 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 22 Sep 2010 16:17:40 -0700
Subject: cifs: fix handling of signing with writepages (try #6)

Get a reference to the file early so we can eventually base the decision
about signing on the correct tcon. If that doesn't work for some reason,
then fall back to generic_writepages. That's just as likely to fail, but
it simplifies the error handling.

In truth, I'm not sure how that could occur anyway, so maybe a NULL
open_file here ought to be a BUG()?

After that, we drop the reference to the open_file and then we re-get
one prior to each WriteAndX call. This helps ensure that the filehandle
isn't held open any longer than necessary and that open files are
reclaimed prior to each write call.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 68 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e5f463e15c9b..de046e183d12 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1353,6 +1353,15 @@ static int cifs_writepages(struct address_space *mapping,
 	int scanned = 0;
 	int xid, long_op;
 
+	/*
+	 * BB: Is this meaningful for a non-block-device file system?
+	 * If it is, we should test it again after we do I/O
+	 */
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
 	cifs_sb = CIFS_SB(mapping->host->i_sb);
 
 	/*
@@ -1362,26 +1371,28 @@ static int cifs_writepages(struct address_space *mapping,
 	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
 		return generic_writepages(mapping, wbc);
 
-	if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
-		if (cifs_sb->tcon->ses->server->secMode &
-				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			if (!experimEnabled)
-				return generic_writepages(mapping, wbc);
-
 	iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
 	if (iov == NULL)
 		return generic_writepages(mapping, wbc);
 
-
 	/*
-	 * BB: Is this meaningful for a non-block-device file system?
-	 * If it is, we should test it again after we do I/O
+	 * if there's no open file, then this is likely to fail too,
+	 * but it'll at least handle the return. Maybe it should be
+	 * a BUG() instead?
 	 */
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
+	open_file = find_writable_file(CIFS_I(mapping->host));
+	if (!open_file) {
 		kfree(iov);
-		return 0;
+		return generic_writepages(mapping, wbc);
+	}
+
+	tcon = open_file->tcon;
+	if (!experimEnabled && tcon->ses->server->secMode &
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		cifsFileInfo_put(open_file);
+		return generic_writepages(mapping, wbc);
 	}
+	cifsFileInfo_put(open_file);
 
 	xid = GetXid();
 
@@ -1486,38 +1497,33 @@ retry:
 				break;
 		}
 		if (n_iov) {
-			/* Search for a writable handle every time we call
-			 * CIFSSMBWrite2.  We can't rely on the last handle
-			 * we used to still be valid
-			 */
 			open_file = find_writable_file(CIFS_I(mapping->host));
 			if (!open_file) {
 				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
-				tcon = open_file->tcon;
 				long_op = cifs_write_timeout(cifsi, offset);
-				rc = CIFSSMBWrite2(xid, tcon,
-						   open_file->netfid,
+				rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
 						   long_op);
 				cifsFileInfo_put(open_file);
 				cifs_update_eof(cifsi, offset, bytes_written);
+			}
 
-				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, "Write2 ret %d, wrote %d",
-						  rc, bytes_written);
-					/* BB what if continued retry is
-					   requested via mount flags? */
-					if (rc == -ENOSPC)
-						set_bit(AS_ENOSPC, &mapping->flags);
-					else
-						set_bit(AS_EIO, &mapping->flags);
-				} else {
-					cifs_stats_bytes_written(tcon, bytes_written);
-				}
+			if (rc || bytes_written < bytes_to_write) {
+				cERROR(1, "Write2 ret %d, wrote %d",
+					  rc, bytes_written);
+				/* BB what if continued retry is
+				   requested via mount flags? */
+				if (rc == -ENOSPC)
+					set_bit(AS_ENOSPC, &mapping->flags);
+				else
+					set_bit(AS_EIO, &mapping->flags);
+			} else {
+				cifs_stats_bytes_written(tcon, bytes_written);
 			}
+
 			for (i = 0; i < n_iov; i++) {
 				page = pvec.pages[first + i];
 				/* Should we also set page error on
-- 
cgit v1.2.3


From aa510da5bfe1dfe263215fd0e05dac96e738a782 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 29 Sep 2010 15:11:56 -0400
Subject: NFS: We must use list_for_each_entry_safe in
 nfs_access_cache_shrinker

We may end up removing the current entry from nfs_access_lru_list.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 86f1d41c6078..fe549f5ef20f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1788,14 +1788,14 @@ static void nfs_access_free_list(struct list_head *head)
 int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(head);
-	struct nfs_inode *nfsi;
+	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
 		return (nr_to_scan == 0) ? 0 : -1;
 
 	spin_lock(&nfs_access_lru_lock);
-	list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
 		struct inode *inode;
 
 		if (nr_to_scan-- == 0)
-- 
cgit v1.2.3


From 46290341cd649c2bfb69e5067c1804c0395c83a1 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 30 Sep 2010 10:34:00 -0400
Subject: GFS2 fatal: filesystem consistency error on rename

This patch fixes a GFS2 problem whereby the first rename after a
mount can result in a file system consistency error being flagged
improperly and cause the file system to withdraw.  The problem is
that the rename code tries to run the rgrp list with function
gfs2_blk2rgrpd before the rgrp list is guaranteed to be read in
from disk.  The patch makes the rename function hold the rindex
glock (as the gfs2_unlink code does today) which reads in the rgrp
list if need be.  There were a total of three places in the rename
code that improperly referenced the rgrp list without the rindex
glock and this patch fixes all three.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c |  8 ++++++--
 fs/gfs2/rgrp.c      | 22 +++++++++++++---------
 fs/gfs2/rgrp.h      |  8 +++++---
 3 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index fba00171d915..0534510200d5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -736,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -750,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
 
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -879,7 +882,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 		al->al_requested = sdp->sd_max_dirres;
 
-		error = gfs2_inplace_reserve(ndip);
+		error = gfs2_inplace_reserve_ri(ndip);
 		if (error)
 			goto out_gunlock_q;
 
@@ -961,6 +964,7 @@ out_gunlock_r:
 	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
+	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f9ddcf401753..fb67f593f408 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1200,7 +1200,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1211,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 		return -EINVAL;
 
 try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
+		if (ip != GFS2_I(sdp->sd_rindex))
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update_special(ip);
+	}
 
 	if (error)
 		return error;
@@ -1227,7 +1231,7 @@ try_again:
 	   try to free it, and try the allocation again. */
 	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
-		if (ip != GFS2_I(sdp->sd_rindex))
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
 		if (error != -EAGAIN)
 			return error;
@@ -1269,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-- 
cgit v1.2.3


From 249e6353001e407edf5c9a74482ecfca90c8ff33 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:27 +0200
Subject: hfsplus: fix BKL leak in hfsplus_ioctl

Currenly the HFSPLUS_IOC_EXT2_GETFLAGS case never unlocks the BKL, which
can lead to easily reproduced lockups when doing multiple GETFLAGS ioctls.

Fix this by only taking the BKL for the HFSPLUS_IOC_EXT2_SETFLAGS case
as neither HFSPLUS_IOC_EXT2_GETFLAGS not the default error case needs it.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..59dc402dfe95 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -26,7 +26,6 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	unsigned int flags;
 
-	lock_kernel();
 	switch (cmd) {
 	case HFSPLUS_IOC_EXT2_GETFLAGS:
 		flags = 0;
@@ -39,6 +38,8 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return put_user(flags, (int __user *)arg);
 	case HFSPLUS_IOC_EXT2_SETFLAGS: {
 		int err = 0;
+
+		lock_kernel();
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err) {
 			unlock_kernel();
@@ -93,7 +94,6 @@ setflags_out:
 		return err;
 	}
 	default:
-		unlock_kernel();
 		return -ENOTTY;
 	}
 }
-- 
cgit v1.2.3


From 94744567fef9602c3d8218a1d8f58c04cce354f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:31 +0200
Subject: hfsplus: split hfsplus_ioctl

Give each ioctl command a function of it's own.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 145 +++++++++++++++++++++++++++++------------------------
 1 file changed, 80 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 59dc402dfe95..0e359c3b55ee 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -21,78 +21,93 @@
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	unsigned int flags = 0;
+
+	if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
+		flags |= FS_IMMUTABLE_FL;
+	if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
+		flags |= FS_APPEND_FL;
+	if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
+		flags |= FS_NODUMP_FL;
+
+	return put_user(flags, user_flags);
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
 	unsigned int flags;
+	int err = 0;
 
-	switch (cmd) {
-	case HFSPLUS_IOC_EXT2_GETFLAGS:
-		flags = 0;
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-			flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-			flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
-		if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-			flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
-		return put_user(flags, (int __user *)arg);
-	case HFSPLUS_IOC_EXT2_SETFLAGS: {
-		int err = 0;
-
-		lock_kernel();
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err) {
-			unlock_kernel();
-			return err;
-		}
+	lock_kernel();
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		goto out_unlock_kernel;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto setflags_out;
-		}
-		if (get_user(flags, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setflags_out;
-		}
-		if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-		    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				err = -EPERM;
-				goto setflags_out;
-			}
-		}
+	if (!is_owner_or_cap(inode)) {
+		err = -EACCES;
+		goto out_drop_write;
+	}
 
-		/* don't silently ignore unsupported ext2 flags */
-		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-			err = -EOPNOTSUPP;
-			goto setflags_out;
-		}
-		if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
-			inode->i_flags |= S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
-		} else {
-			inode->i_flags &= ~S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-		}
-		if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
-			inode->i_flags |= S_APPEND;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
-		} else {
-			inode->i_flags &= ~S_APPEND;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+	if (get_user(flags, user_flags)) {
+		err = -EFAULT;
+		goto out_drop_write;
+	}
+
+	if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
+	    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			err = -EPERM;
+			goto out_drop_write;
 		}
-		if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
-			HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
-		else
-			HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-setflags_out:
-		mnt_drop_write(filp->f_path.mnt);
-		unlock_kernel();
-		return err;
 	}
+
+	/* don't silently ignore unsupported ext2 flags */
+	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+		err = -EOPNOTSUPP;
+		goto out_drop_write;
+	}
+	if (flags & FS_IMMUTABLE_FL) {
+		inode->i_flags |= S_IMMUTABLE;
+		HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
+	} else {
+		inode->i_flags &= ~S_IMMUTABLE;
+		HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
+	}
+	if (flags & FS_APPEND_FL) {
+		inode->i_flags |= S_APPEND;
+		HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
+	} else {
+		inode->i_flags &= ~S_APPEND;
+		HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+	}
+	if (flags & FS_NODUMP_FL)
+		HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
+	else
+		HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
+out_unlock_kernel:
+	unlock_kernel();
+	return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case HFSPLUS_IOC_EXT2_GETFLAGS:
+		return hfsplus_ioctl_getflags(file, argp);
+	case HFSPLUS_IOC_EXT2_SETFLAGS:
+		return hfsplus_ioctl_setflags(file, argp);
 	default:
 		return -ENOTTY;
 	}
-- 
cgit v1.2.3


From 6333816ade7e04a96ec0a34a8378c455e4f7c4dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:35 +0200
Subject: hfsplus: protect setflags using i_mutex

Use i_mutex for protecting against concurrent setflags ioctls like in
other filesystems and get rid of the BKL in hfsplus_ioctl.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 0e359c3b55ee..906bd3dd3145 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,7 +17,6 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
@@ -42,10 +41,9 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	unsigned int flags;
 	int err = 0;
 
-	lock_kernel();
 	err = mnt_want_write(file->f_path.mnt);
 	if (err)
-		goto out_unlock_kernel;
+		goto out;
 
 	if (!is_owner_or_cap(inode)) {
 		err = -EACCES;
@@ -57,18 +55,20 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 		goto out_drop_write;
 	}
 
+	mutex_lock(&inode->i_mutex);
+
 	if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
 	    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
 			err = -EPERM;
-			goto out_drop_write;
+			goto out_unlock_inode;
 		}
 	}
 
 	/* don't silently ignore unsupported ext2 flags */
 	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
 		err = -EOPNOTSUPP;
-		goto out_drop_write;
+		goto out_unlock_inode;
 	}
 	if (flags & FS_IMMUTABLE_FL) {
 		inode->i_flags |= S_IMMUTABLE;
@@ -92,10 +92,11 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 
+out_unlock_inode:
+	mutex_lock(&inode->i_mutex);
 out_drop_write:
 	mnt_drop_write(file->f_path.mnt);
-out_unlock_kernel:
-	unlock_kernel();
+out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 40bf48afe92fcea61e7e164f0b2599fba8b88124 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:39 +0200
Subject: hfsplus: introduce alloc_mutex

Use a new per-sb alloc_mutex instead of abusing i_mutex of the alloc_file
to protect block allocations.  This gets rid of lockdep nesting warnings
and prepares for extending the scope of alloc_mutex.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bitmap.c     | 8 ++++----
 fs/hfsplus/hfsplus_fs.h | 3 +++
 fs/hfsplus/super.c      | 1 +
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..8a781769a901 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -29,7 +29,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
@@ -154,7 +154,7 @@ done:
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
 	return start;
 }
 
@@ -175,7 +175,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
@@ -226,7 +226,7 @@ out:
 	kunmap(page);
 	HFSPLUS_SB(sb).free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
 
 	return 0;
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..df0a6312f0f0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -116,6 +116,9 @@ struct hfsplus_sb_info {
 	struct inode *hidden_dir;
 	struct nls_table *nls;
 
+	/* synchronize block allocations */
+	struct mutex alloc_mutex;
+
 	/* Runtime variables */
 	u32 blockoffset;
 	u32 sect_count;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..3dc62aa58728 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -321,6 +321,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_fs_info = sbi;
 	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+	mutex_init(&sbi->alloc_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
-- 
cgit v1.2.3


From a9fdbf8c6070d49c482e209df7ee93d9ec41ea27 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:50 +0200
Subject: hfsplus: use alloc_mutex in hfsplus_sync_fs

Use alloc_mutex to protect hfsplus_sync_fs against itself and concurrent
allocations, which allows to get rid of lock_super in hfsplus.

Note that most fields in the superblock still aren't protected against
concurrent allocations, that will follow later.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3dc62aa58728..4936642debaa 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -162,7 +162,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-	lock_super(sb);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
 	sb->s_dirt = 0;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
@@ -195,7 +195,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
 	return 0;
 }
 
-- 
cgit v1.2.3


From e753a62156e952fd5a3c64f98454d9aeee3a2546 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Oct 2010 05:41:53 +0200
Subject: hfsplus: remove BKL from hfsplus_put_super

Except for ->put_super the BKL is now gone from HFS, which means it's
superflous there too as ->put_super is serialized by the VFS.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4936642debaa..a1d3fd920403 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
 
@@ -213,8 +212,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	if (!sb->s_fs_info)
 		return;
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
@@ -235,8 +232,6 @@ static void hfsplus_put_super(struct super_block *sb)
 	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
-- 
cgit v1.2.3


From dd73a01a30d729e8fa6f829c4582650e258e36f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:42:59 +0200
Subject: hfsplus: fix HFSPLUS_SB calling convention

HFSPLUS_SB doesn't return a pointer to the hfsplus-specific superblock
information like all other FOO_SB macros, but dereference the pointer in a way
that made it look like a direct struct derefence.  This only works as long
as the HFSPLUS_SB macro is used directly and prevents us from keepig a local
hfsplus_sb_info pointer.  Fix the calling convention and introduce a local
sbi variable in all functions that use it constantly.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bitmap.c     |  20 ++++---
 fs/hfsplus/btree.c      |   8 +--
 fs/hfsplus/catalog.c    |  25 ++++----
 fs/hfsplus/dir.c        |  40 +++++++------
 fs/hfsplus/extents.c    |  37 ++++++------
 fs/hfsplus/hfsplus_fs.h |   3 +-
 fs/hfsplus/inode.c      |  57 +++++++++---------
 fs/hfsplus/ioctl.c      |   4 +-
 fs/hfsplus/options.c    |   2 +-
 fs/hfsplus/part_tbl.c   |   5 +-
 fs/hfsplus/super.c      | 151 +++++++++++++++++++++++++-----------------------
 fs/hfsplus/unicode.c    |  16 ++---
 fs/hfsplus/wrapper.c    |  30 +++++-----
 13 files changed, 209 insertions(+), 189 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 8a781769a901..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +151,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..96e2128748f2 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -61,12 +61,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (id == HFSPLUS_EXT_CNID) {
 		tree->keycmp = hfsplus_ext_cmp_key;
 	} else if (id == HFSPLUS_CAT_CNID) {
-		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+		if ((HFSPLUS_SB(sb)->flags & HFSPLUS_SB_HFSX) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
 		else {
 			tree->keycmp = hfsplus_cat_case_cmp_key;
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+			HFSPLUS_SB(sb)->flags |= HFSPLUS_SB_CASEFOLD;
 		}
 	} else {
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
@@ -200,9 +200,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 			return ERR_PTR(res);
 		HFSPLUS_I(inode).phys_size = inode->i_size =
 				(loff_t)HFSPLUS_I(inode).alloc_blocks <<
-				HFSPLUS_SB(tree->sb).alloc_blksz_shift;
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
 		HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-					     HFSPLUS_SB(tree->sb).fs_shift;
+					     HFSPLUS_SB(tree->sb)->fs_shift;
 		inode_set_bytes(inode, inode->i_size);
 		count = inode->i_size >> tree->node_size_shift;
 		tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..75ac1e466f1c 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -86,6 +86,8 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
 	if (S_ISDIR(inode->i_mode)) {
 		struct hfsplus_cat_folder *folder;
 
@@ -99,7 +101,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			folder->attribute_mod_date =
 			folder->access_date = hfsp_now2mt();
 		hfsplus_set_perms(inode, &folder->permissions);
-		if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+		if (inode == sbi->hidden_dir)
 			/* invisible and namelocked */
 			folder->user_info.frFlags = cpu_to_be16(0x5000);
 		return sizeof(*folder);
@@ -122,8 +124,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
 				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
 			} else {
-				file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-				file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+				file->user_info.fdType = cpu_to_be32(sbi->type);
+				file->user_info.fdCreator = cpu_to_be32(sbi->creator);
 			}
 			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,7 +133,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
+			file->create_date = HFSPLUS_I(sbi->hidden_dir).create_date;
 			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
 		}
 		return sizeof(*file);
@@ -180,15 +182,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
 {
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
-	struct super_block *sb;
 	hfsplus_cat_entry entry;
 	int entry_size;
 	int err;
 
 	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +235,7 @@ err2:
 
 int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 {
-	struct super_block *sb;
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
 	struct hfsplus_fork_raw fork;
 	struct list_head *pos;
@@ -242,8 +243,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	u16 type;
 
 	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	if (!str) {
 		int len;
@@ -312,7 +312,7 @@ int hfsplus_rename_cat(u32 cnid,
 		       struct inode *src_dir, struct qstr *src_name,
 		       struct inode *dst_dir, struct qstr *dst_name)
 {
-	struct super_block *sb;
+	struct super_block *sb = src_dir->i_sb;
 	struct hfs_find_data src_fd, dst_fd;
 	hfsplus_cat_entry entry;
 	int entry_size, type;
@@ -320,8 +320,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
-	sb = src_dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..584777ddb0b9 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
 	dentry->d_op = &hfsplus_dentry_operations;
 	dentry->d_fsdata = NULL;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
 again:
 	err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
 		cnid = be32_to_cpu(entry.file.id);
 		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
 		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
+		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir).create_date ||
 		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-		    HFSPLUS_SB(sb).hidden_dir) {
+		    HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
 
@@ -86,7 +86,8 @@ again:
 				linkid = be32_to_cpu(entry.file.permissions.dev);
 				str.len = sprintf(name, "iNode%d", linkid);
 				str.name = name;
-				hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+				hfsplus_cat_build_key(sb, fd.search_key,
+					HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
 				goto again;
 			}
 		} else if (!dentry->d_fsdata)
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (filp->f_pos >= inode->i_size)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				err = -EIO;
 				goto out;
 			}
-			if (HFSPLUS_SB(sb).hidden_dir &&
-			    HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+			if (HFSPLUS_SB(sb)->hidden_dir &&
+			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+					be32_to_cpu(entry.folder.id))
 				goto next;
 			if (filldir(dirent, strbuf, len, filp->f_pos,
 				    be32_to_cpu(entry.folder.id), DT_DIR))
@@ -260,7 +262,7 @@ static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			struct dentry *dst_dentry)
 {
-	struct super_block *sb = dst_dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
 	struct inode *inode = src_dentry->d_inode;
 	struct inode *src_dir = src_dentry->d_parent->d_inode;
 	struct qstr str;
@@ -279,22 +281,22 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			str.len = sprintf(name, "iNode%d", id);
 			res = hfsplus_rename_cat(inode->i_ino,
 						 src_dir, &src_dentry->d_name,
-						 HFSPLUS_SB(sb).hidden_dir, &str);
+						 sbi->hidden_dir, &str);
 			if (!res)
 				break;
 			if (res != -EEXIST)
 				return res;
 		}
 		HFSPLUS_I(inode).dev = id;
-		cnid = HFSPLUS_SB(sb).next_cnid++;
+		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
 			return res;
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 	}
-	cnid = HFSPLUS_SB(sb).next_cnid++;
+	cnid = sbi->next_cnid++;
 	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
 	if (res)
 		return res;
@@ -304,15 +306,15 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	atomic_inc(&inode->i_count);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	HFSPLUS_SB(sb).file_count++;
-	sb->s_dirt = 1;
+	sbi->file_count++;
+	dst_dir->i_sb->s_dirt = 1;
 
 	return 0;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct super_block *sb = dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode = dentry->d_inode;
 	struct qstr str;
 	char name[32];
@@ -329,7 +331,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		str.len = sprintf(name, "temp%lu", inode->i_ino);
 		res = hfsplus_rename_cat(inode->i_ino,
 					 dir, &dentry->d_name,
-					 HFSPLUS_SB(sb).hidden_dir, &str);
+					 sbi->hidden_dir, &str);
 		if (!res)
 			inode->i_flags |= S_DEAD;
 		return res;
@@ -344,10 +346,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		clear_nlink(inode);
 	if (!inode->i_nlink) {
 		if (inode->i_ino != cnid) {
-			HFSPLUS_SB(sb).file_count--;
+			sbi->file_count--;
 			if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
 				res = hfsplus_delete_cat(inode->i_ino,
-							 HFSPLUS_SB(sb).hidden_dir,
+							 sbi->hidden_dir,
 							 NULL);
 				if (!res)
 					hfsplus_delete_inode(inode);
@@ -356,7 +358,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		} else
 			hfsplus_delete_inode(inode);
 	} else
-		HFSPLUS_SB(sb).file_count--;
+		sbi->file_count--;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..181969814344 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -108,7 +108,7 @@ void hfsplus_ext_write_extent(struct inode *inode)
 	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 		__hfsplus_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
@@ -162,7 +162,7 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 	    block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 	res = __hfsplus_ext_cache_extent(&fd, inode, block);
 	hfs_find_exit(&fd);
 	return res;
@@ -172,16 +172,15 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		      struct buffer_head *bh_result, int create)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	int res = -EIO;
 	u32 ablock, dblock, mask;
 	int shift;
 
-	sb = inode->i_sb;
-
 	/* Convert inode block to disk allocation block */
-	shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
-	ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+	shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+	ablock = iblock >> sbi->fs_shift;
 
 	if (iblock >= HFSPLUS_I(inode).fs_blocks) {
 		if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
@@ -215,8 +214,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 
 done:
 	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
-	mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
-	map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+	mask = (1 << sbi->fs_shift) - 1;
+	map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
 	if (create) {
 		set_buffer_new(bh_result);
 		HFSPLUS_I(inode).phys_size += sb->s_blocksize;
@@ -327,7 +326,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 	if (total_blocks == blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	do {
 		res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
 						total_blocks, type);
@@ -348,13 +347,16 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 int hfsplus_file_extend(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u32 start, len, goal;
 	int res;
 
-	if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
 		// extend alloc file
-		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
-			HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+				sbi->alloc_file->i_size * 8,
+				sbi->total_blocks, sbi->free_blocks);
 		return -ENOSPC;
 	}
 
@@ -369,8 +371,8 @@ int hfsplus_file_extend(struct inode *inode)
 	}
 
 	len = HFSPLUS_I(inode).clump_blocks;
-	start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
-	if (start >= HFSPLUS_SB(sb).total_blocks) {
+	start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+	if (start >= sbi->total_blocks) {
 		start = hfsplus_block_allocate(sb, goal, 0, &len);
 		if (start >= goal) {
 			res = -ENOSPC;
@@ -463,13 +465,14 @@ void hfsplus_file_truncate(struct inode *inode)
 	} else if (inode->i_size == HFSPLUS_I(inode).phys_size)
 		return;
 
-	blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
+	blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+			HFSPLUS_SB(sb)->alloc_blksz_shift;
 	alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
 	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	while (1) {
 		if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
 			hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index df0a6312f0f0..55f42b49f0f1 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -375,17 +375,16 @@ int hfsplus_read_wrapper(struct super_block *);
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 
 /* access macros */
-/*
 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
+/*
 static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 {
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
 */
-#define HFSPLUS_SB(super)	(*(struct hfsplus_sb_info *)(super)->s_fs_info)
 #define HFSPLUS_I(inode)	(*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
 
 #if 1
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..050eee6c81bd 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 
 	switch (inode->i_ino) {
 	case HFSPLUS_EXT_CNID:
-		tree = HFSPLUS_SB(sb).ext_tree;
+		tree = HFSPLUS_SB(sb)->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		tree = HFSPLUS_SB(sb).cat_tree;
+		tree = HFSPLUS_SB(sb)->cat_tree;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		tree = HFSPLUS_SB(sb).attr_tree;
+		tree = HFSPLUS_SB(sb)->attr_tree;
 		break;
 	default:
 		BUG();
@@ -190,7 +190,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	mutex_init(&HFSPLUS_I(inode).extents_lock);
 	HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	err = hfsplus_find_cat(sb, dir->i_ino, &fd);
 	if (!err)
 		err = hfsplus_cat_read_inode(inode, &fd);
@@ -202,7 +202,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	HFSPLUS_I(inode).rsrc_inode = dir;
 	HFSPLUS_I(dir).rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb)->rsrc_inodes);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
@@ -211,26 +211,24 @@ out:
 
 static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
 {
-	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	u16 mode;
 
 	mode = be16_to_cpu(perms->mode);
 
 	inode->i_uid = be32_to_cpu(perms->owner);
 	if (!inode->i_uid && !mode)
-		inode->i_uid = HFSPLUS_SB(sb).uid;
+		inode->i_uid = sbi->uid;
 
 	inode->i_gid = be32_to_cpu(perms->group);
 	if (!inode->i_gid && !mode)
-		inode->i_gid = HFSPLUS_SB(sb).gid;
+		inode->i_gid = sbi->gid;
 
 	if (dir) {
-		mode = mode ? (mode & S_IALLUGO) :
-			(S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+		mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
 		mode |= S_IFDIR;
 	} else if (!mode)
-		mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
-			~(HFSPLUS_SB(sb).umask));
+		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
 	inode->i_mode = mode;
 
 	HFSPLUS_I(inode).rootflags = perms->rootflags;
@@ -282,7 +280,8 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
-			hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+			hfsplus_delete_cat(inode->i_ino,
+					   HFSPLUS_SB(sb)->hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
 		mutex_unlock(&inode->i_mutex);
@@ -361,11 +360,13 @@ static const struct file_operations hfsplus_file_operations = {
 
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct inode *inode = new_inode(sb);
+
 	if (!inode)
 		return NULL;
 
-	inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+	inode->i_ino = sbi->next_cnid++;
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
@@ -386,22 +387,22 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	HFSPLUS_I(inode).rsrc_inode = NULL;
 	if (S_ISDIR(inode->i_mode)) {
 		inode->i_size = 2;
-		HFSPLUS_SB(sb).folder_count++;
+		sbi->folder_count++;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (S_ISREG(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &hfsplus_file_inode_operations;
 		inode->i_fop = &hfsplus_file_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+		HFSPLUS_I(inode).clump_blocks = sbi->data_clump_blocks;
 	} else if (S_ISLNK(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
 		HFSPLUS_I(inode).clump_blocks = 1;
 	} else
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 	sb->s_dirt = 1;
@@ -414,11 +415,11 @@ void hfsplus_delete_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 
 	if (S_ISDIR(inode->i_mode)) {
-		HFSPLUS_SB(sb).folder_count--;
+		HFSPLUS_SB(sb)->folder_count--;
 		sb->s_dirt = 1;
 		return;
 	}
-	HFSPLUS_SB(sb).file_count--;
+	HFSPLUS_SB(sb)->file_count--;
 	if (S_ISREG(inode->i_mode)) {
 		if (!inode->i_nlink) {
 			inode->i_size = 0;
@@ -434,6 +435,7 @@ void hfsplus_delete_inode(struct inode *inode)
 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u32 count;
 	int i;
 
@@ -450,10 +452,13 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 	inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
 	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-	HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_I(inode).clump_blocks)
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
-				HFSPLUS_SB(sb).data_clump_blocks;
+	HFSPLUS_I(inode).clump_blocks =
+		be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+	if (!HFSPLUS_I(inode).clump_blocks) {
+		HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+			sbi->rsrc_clump_blocks :
+			sbi->data_clump_blocks;
+	}
 }
 
 void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
@@ -538,7 +543,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 	if (!main_inode->i_nlink)
 		return 0;
 
-	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
 		/* panic? */
 		return -EIO;
 
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 906bd3dd3145..66dd64b88b2e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -126,7 +126,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
 		return -EOPNOTSUPP;
 
-	res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 	if (res)
 		return res;
 	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -169,7 +169,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		return -EOPNOTSUPP;
 
 	if (size) {
-		res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 		if (res)
 			return res;
 		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f2598aef206e 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -171,7 +171,7 @@ done:
 
 int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 {
-	struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
 int hfs_part_find(struct super_block *sb,
 		  sector_t *part_start, sector_t *part_size)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	__be16 *data;
 	int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
 		for (i = 0; i < size; p++, i++) {
 			if (p->pdStart && p->pdSize &&
 			    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(p->pdStart);
 				*part_size = be32_to_cpu(p->pdSize);
 				res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
 		size = be32_to_cpu(pm->pmMapBlkCnt);
 		for (i = 0; i < size;) {
 			if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(pm->pmPyPartStart);
 				*part_size = be32_to_cpu(pm->pmPartBlkCnt);
 				res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a1d3fd920403..1bf00b9ecc1a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -41,7 +41,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 
 	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
 	read_inode:
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
 		if (!err)
 			err = hfsplus_cat_read_inode(inode, &fd);
@@ -50,7 +50,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 			goto bad_inode;
 		goto done;
 	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
+	vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
 	switch(inode->i_ino) {
 	case HFSPLUS_ROOT_CNID:
 		goto read_inode;
@@ -89,6 +89,7 @@ bad_inode:
 static int hfsplus_write_inode(struct inode *inode,
 		struct writeback_control *wbc)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	struct hfsplus_vh *vhdr;
 	int ret = 0;
 
@@ -97,48 +98,48 @@ static int hfsplus_write_inode(struct inode *inode,
 	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
 		return hfsplus_cat_write_inode(inode);
 	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
+	vhdr = sbi->s_vhdr;
 	switch (inode->i_ino) {
 	case HFSPLUS_ROOT_CNID:
 		ret = hfsplus_cat_write_inode(inode);
 		break;
 	case HFSPLUS_EXT_CNID:
 		if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
+			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
 			inode->i_sb->s_dirt = 1;
 		}
 		hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+		hfs_btree_write(sbi->ext_tree);
 		break;
 	case HFSPLUS_CAT_CNID:
 		if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
+			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
 			inode->i_sb->s_dirt = 1;
 		}
 		hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+		hfs_btree_write(sbi->cat_tree);
 		break;
 	case HFSPLUS_ALLOC_CNID:
 		if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
+			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
 			inode->i_sb->s_dirt = 1;
 		}
 		hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
 		break;
 	case HFSPLUS_START_CNID:
 		if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
+			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
 			inode->i_sb->s_dirt = 1;
 		}
 		hfsplus_inode_write_fork(inode, &vhdr->start_file);
 		break;
 	case HFSPLUS_ATTR_CNID:
 		if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
+			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
 			inode->i_sb->s_dirt = 1;
 		}
 		hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
+		hfs_btree_write(sbi->attr_tree);
 		break;
 	}
 	return ret;
@@ -157,44 +158,46 @@ static void hfsplus_evict_inode(struct inode *inode)
 
 int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
-	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_mutex);
+	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
-	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-	vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-	vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-	vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+	vhdr->next_alloc = cpu_to_be32(sbi->next_alloc);
+	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-		if (HFSPLUS_SB(sb).sect_count) {
+	mark_buffer_dirty(sbi->s_vhbh);
+	if (sbi->flags & HFSPLUS_SB_WRITEBACKUP) {
+		if (sbi->sect_count) {
 			struct buffer_head *bh;
 			u32 block, offset;
 
-			block = HFSPLUS_SB(sb).blockoffset;
-			block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-			offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-				HFSPLUS_SB(sb).sect_count, block, offset);
+			block = sbi->blockoffset;
+			block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+			offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+					  sbi->blockoffset, sbi->sect_count,
+					  block, offset);
 			bh = sb_bread(sb, block);
 			if (bh) {
 				vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
 				if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-					memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+					memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
 					mark_buffer_dirty(bh);
 					brelse(bh);
 				} else
 					printk(KERN_WARNING "hfs: backup not found!\n");
 			}
 		}
-		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
+		sbi->flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return 0;
 }
 
@@ -208,28 +211,31 @@ static void hfsplus_write_super(struct super_block *sb)
 
 static void hfsplus_put_super(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
+
 	if (!sb->s_fs_info)
 		return;
 
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
-	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+		struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 		vhdr->modify_date = hfsp_now2mt();
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-		mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-		sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+		mark_buffer_dirty(sbi->s_vhbh);
+		sync_dirty_buffer(sbi->s_vhbh);
 	}
 
-	hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-	hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-	iput(HFSPLUS_SB(sb).alloc_file);
-	iput(HFSPLUS_SB(sb).hidden_dir);
-	brelse(HFSPLUS_SB(sb).s_vhbh);
-	unload_nls(HFSPLUS_SB(sb).nls);
+	hfs_btree_close(sbi->cat_tree);
+	hfs_btree_close(sbi->ext_tree);
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
+	brelse(sbi->s_vhbh);
+	unload_nls(sbi->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
@@ -237,15 +243,16 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-	buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+	buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+	buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
-	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -258,11 +265,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
 		struct hfsplus_sb_info sbi;
 
 		memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-		sbi.nls = HFSPLUS_SB(sb).nls;
+		sbi.nls = HFSPLUS_SB(sb)->nls;
 		if (!hfsplus_parse_options(data, &sbi))
 			return -EINVAL;
 
@@ -340,7 +347,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = -EINVAL;
 		goto cleanup;
 	}
-	vhdr = HFSPLUS_SB(sb).s_vhdr;
+	vhdr = sbi->s_vhdr;
 
 	/* Copy parts of the volume header into the superblock */
 	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -349,18 +356,20 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-	HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-	HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-	HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-	HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-	HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-	HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).data_clump_blocks)
-		HFSPLUS_SB(sb).data_clump_blocks = 1;
-	HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-		HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+	sbi->next_alloc = be32_to_cpu(vhdr->next_alloc);
+	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+	sbi->file_count = be32_to_cpu(vhdr->file_count);
+	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+	sbi->data_clump_blocks =
+		be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->data_clump_blocks)
+		sbi->data_clump_blocks = 1;
+	sbi->rsrc_clump_blocks =
+		be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->rsrc_clump_blocks)
+		sbi->rsrc_clump_blocks = 1;
 
 	/* Set up operations so we can load metadata */
 	sb->s_op = &hfsplus_sops;
@@ -383,13 +392,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->flags &= ~HFSPLUS_SB_FORCE;
 
 	/* Load metadata objects (B*Trees) */
-	HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-	if (!HFSPLUS_SB(sb).ext_tree) {
+	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+	if (!sbi->ext_tree) {
 		printk(KERN_ERR "hfs: failed to load extents file\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-	if (!HFSPLUS_SB(sb).cat_tree) {
+	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+	if (!sbi->cat_tree) {
 		printk(KERN_ERR "hfs: failed to load catalog file\n");
 		goto cleanup;
 	}
@@ -400,7 +409,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = PTR_ERR(inode);
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).alloc_file = inode;
+	sbi->alloc_file = inode;
 
 	/* Load the root directory */
 	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -419,7 +428,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
 	str.name = HFSP_HIDDENDIR_NAME;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(sbi->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
@@ -430,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 			err = PTR_ERR(inode);
 			goto cleanup;
 		}
-		HFSPLUS_SB(sb).hidden_dir = inode;
+		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
@@ -445,15 +454,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	be32_add_cpu(&vhdr->write_count, 1);
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+	mark_buffer_dirty(sbi->s_vhbh);
+	sync_dirty_buffer(sbi->s_vhbh);
 
-	if (!HFSPLUS_SB(sb).hidden_dir) {
+	if (!sbi->hidden_dir) {
 		printk(KERN_DEBUG "hfs: create hidden dir...\n");
-		HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, HFSPLUS_SB(sb).hidden_dir);
-		mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+				   &str, sbi->hidden_dir);
+		mark_inode_dirty(sbi->hidden_dir);
 	}
 out:
 	unload_nls(sbi->nls);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..a15fcffce74f 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
 {
 	const hfsplus_unichr *ip;
-	struct nls_table *nls = HFSPLUS_SB(sb).nls;
+	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
 	u8 *op;
 	u16 cc, c0, c1;
 	u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 	ustrlen = be16_to_cpu(ustr->length);
 	len = *len_p;
 	ce1 = NULL;
-	compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	compose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
 
 	while (ustrlen > 0) {
 		c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
 			      wchar_t *uc)
 {
-	int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
 	if (size <= 0) {
 		*uc = '?';
 		size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	u16 *dstr, outlen = 0;
 	wchar_t c;
 
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
 	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 	wchar_t c;
 	u16 c2;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = (HFSPLUS_SB(sb)->flags & HFSPLUS_SB_CASEFOLD);
+	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
 	hash = init_name_hash();
 	astr = str->name;
 	len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 	u16 c1, c2;
 	wchar_t c;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = (HFSPLUS_SB(sb)->flags & HFSPLUS_SB_CASEFOLD);
+	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
 	astr1 = s1->name;
 	len1 = s1->len;
 	astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..c89e198e8a2a 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 	*start = 0;
 	*size = sb->s_bdev->bd_inode->i_size >> 9;
 
-	if (HFSPLUS_SB(sb).session >= 0) {
-		te.cdte_track = HFSPLUS_SB(sb).session;
+	if (HFSPLUS_SB(sb)->session >= 0) {
+		te.cdte_track = HFSPLUS_SB(sb)->session;
 		te.cdte_format = CDROM_LBA;
 		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
 		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 /* Takes in super block, returns true if good data read */
 int hfsplus_read_wrapper(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	struct hfsplus_vh *vhdr;
 	struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
 			break;
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+			sbi->flags |= HFSPLUS_SB_HFSX;
 			break;
 		}
 		brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
 	if (blocksize < HFSPLUS_SECTOR_SIZE ||
 	    ((blocksize - 1) & blocksize))
 		return -EINVAL;
-	HFSPLUS_SB(sb).alloc_blksz = blocksize;
-	HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+	sbi->alloc_blksz = blocksize;
+	sbi->alloc_blksz_shift = 0;
 	while ((blocksize >>= 1) != 0)
-		HFSPLUS_SB(sb).alloc_blksz_shift++;
-	blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+		sbi->alloc_blksz_shift++;
+	blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
 	/* align block size to block offset */
 	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,22 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	HFSPLUS_SB(sb).blockoffset = part_start >>
-			(sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-	HFSPLUS_SB(sb).sect_count = part_size;
-	HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-			sb->s_blocksize_bits;
+	sbi->blockoffset =
+		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->sect_count = part_size;
+	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 
 	bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
 	if (!bh)
 		return -EIO;
 
 	/* should still be the same... */
-	if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
+	if (vhdr->signature != (sbi->flags & HFSPLUS_SB_HFSX ?
 				cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
 				cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
 		goto error;
-	HFSPLUS_SB(sb).s_vhbh = bh;
-	HFSPLUS_SB(sb).s_vhdr = vhdr;
+	sbi->s_vhbh = bh;
+	sbi->s_vhdr = vhdr;
 
 	return 0;
  error:
-- 
cgit v1.2.3


From 6af502de224c3742936d54eee7e3690c09822934 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:31 +0200
Subject: hfsplus: fix HFSPLUS_I calling convention

HFSPLUS_I doesn't return a pointer to the hfsplus-specific inode
information like all other FOO_I macros, but dereference the pointer in a way
that made it look like a direct struct derefence.  This only works as long
as the HFSPLUS_I macro is used directly and prevents us from keepig a local
hfsplus_inode_info pointer.  Fix the calling convention and introduce a local
hip variable in all functions that use it constantly.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/btree.c      |   9 +--
 fs/hfsplus/catalog.c    |  14 ++--
 fs/hfsplus/dir.c        |  14 ++--
 fs/hfsplus/extents.c    | 171 ++++++++++++++++++++++++++----------------------
 fs/hfsplus/hfsplus_fs.h |   8 +--
 fs/hfsplus/inode.c      | 112 ++++++++++++++++---------------
 fs/hfsplus/ioctl.c      |  22 ++++---
 fs/hfsplus/super.c      |  16 ++---
 8 files changed, 193 insertions(+), 173 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 96e2128748f2..f75cf222b8b4 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -192,17 +192,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
 	while (!tree->free_nodes) {
 		struct inode *inode = tree->inode;
+		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 		u32 count;
 		int res;
 
 		res = hfsplus_file_extend(inode);
 		if (res)
 			return ERR_PTR(res);
-		HFSPLUS_I(inode).phys_size = inode->i_size =
-				(loff_t)HFSPLUS_I(inode).alloc_blocks <<
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
 				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
-		HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-					     HFSPLUS_SB(tree->sb)->fs_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
 		inode_set_bytes(inode, inode->i_size);
 		count = inode->i_size >> tree->node_size_shift;
 		tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 75ac1e466f1c..48979c4e8fa5 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -77,8 +77,8 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+	HFSPLUS_I(inode)->rootflags = perms->rootflags;
+	HFSPLUS_I(inode)->userflags = perms->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
@@ -95,7 +95,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		memset(folder, 0, sizeof(*folder));
 		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
 		folder->id = cpu_to_be32(inode->i_ino);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			folder->create_date =
 			folder->content_mod_date =
 			folder->attribute_mod_date =
@@ -113,7 +113,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		file->type = cpu_to_be16(HFSPLUS_FILE);
 		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
 		file->id = cpu_to_be32(cnid);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			file->create_date =
 			file->content_mod_date =
 			file->attribute_mod_date =
@@ -133,8 +133,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(sbi->hidden_dir).create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->dev);
 		}
 		return sizeof(*file);
 	}
@@ -279,7 +279,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
 	}
 
-	list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+	list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
 		struct hfsplus_readdir_data *rd =
 			list_entry(pos, struct hfsplus_readdir_data, list);
 		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 584777ddb0b9..5ca8308f9eb7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -68,8 +68,8 @@ again:
 		cnid = be32_to_cpu(entry.file.id);
 		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
 		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir).create_date ||
-		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
+		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
 		    HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
@@ -102,7 +102,7 @@ again:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (S_ISREG(inode->i_mode))
-		HFSPLUS_I(inode).dev = linkid;
+		HFSPLUS_I(inode)->dev = linkid;
 out:
 	d_add(dentry, inode);
 	return NULL;
@@ -219,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->private_data = rd;
 		rd->file = filp;
-		list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
 out:
@@ -287,7 +287,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			if (res != -EEXIST)
 				return res;
 		}
-		HFSPLUS_I(inode).dev = id;
+		HFSPLUS_I(inode)->dev = id;
 		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
@@ -326,7 +326,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 
 	cnid = (u32)(unsigned long)dentry->d_fsdata;
 	if (inode->i_ino == cnid &&
-	    atomic_read(&HFSPLUS_I(inode).opencnt)) {
+	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 		str.name = name;
 		str.len = sprintf(name, "temp%lu", inode->i_ino);
 		res = hfsplus_rename_cat(inode->i_ino,
@@ -347,7 +347,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (!inode->i_nlink) {
 		if (inode->i_ino != cnid) {
 			sbi->file_count--;
-			if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+			if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 				res = hfsplus_delete_cat(inode->i_ino,
 							 sbi->hidden_dir,
 							 NULL);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 181969814344..b1017eaa4fdc 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,27 +85,32 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 
 static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-			      HFSPLUS_IS_RSRC(inode) ?  HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+			      HFSPLUS_IS_RSRC(inode) ?
+				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
 	res = hfs_brec_find(fd);
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+	if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
 			return;
-		hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hfs_brec_insert(fd, hip->cached_extents,
+				sizeof(hfsplus_extent_rec));
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	} else {
 		if (res)
 			return;
-		hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-		HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+		hfs_bnode_write(fd->bnode, hip->cached_extents,
+				fd->entryoffset, fd->entrylength);
+		hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
 	}
 }
 
 void hfsplus_ext_write_extent(struct inode *inode)
 {
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
 		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -136,30 +141,34 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
-	res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-					block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+					block, HFSPLUS_IS_RSRC(inode) ?
+						HFSPLUS_TYPE_RSRC :
+						HFSPLUS_TYPE_DATA);
 	if (!res) {
-		HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-		HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
 	} else {
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	}
 	return res;
 }
 
 static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	int res;
 
-	if (block >= HFSPLUS_I(inode).cached_start &&
-	    block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+	if (block >= hip->cached_start &&
+	    block < hip->cached_start + hip->cached_blocks)
 		return 0;
 
 	hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -174,6 +183,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 {
 	struct super_block *sb = inode->i_sb;
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res = -EIO;
 	u32 ablock, dblock, mask;
 	int shift;
@@ -182,10 +192,10 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 	ablock = iblock >> sbi->fs_shift;
 
-	if (iblock >= HFSPLUS_I(inode).fs_blocks) {
-		if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+	if (iblock >= hip->fs_blocks) {
+		if (iblock > hip->fs_blocks || !create)
 			return -EIO;
-		if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+		if (ablock >= hip->alloc_blocks) {
 			res = hfsplus_file_extend(inode);
 			if (res)
 				return res;
@@ -193,24 +203,24 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	} else
 		create = 0;
 
-	if (ablock < HFSPLUS_I(inode).first_blocks) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+	if (ablock < hip->first_blocks) {
+		dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
 		goto done;
 	}
 
 	if (inode->i_ino == HFSPLUS_EXT_CNID)
 		return -EIO;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&hip->extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
-					     HFSPLUS_I(inode).cached_start);
+		dblock = hfsplus_ext_find_block(hip->cached_extents,
+						ablock - hip->cached_start);
 	} else {
-		mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+		mutex_unlock(&hip->extents_lock);
 		return -EIO;
 	}
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
 done:
 	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
@@ -218,8 +228,8 @@ done:
 	map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
 	if (create) {
 		set_buffer_new(bh_result);
-		HFSPLUS_I(inode).phys_size += sb->s_blocksize;
-		HFSPLUS_I(inode).fs_blocks++;
+		hip->phys_size += sb->s_blocksize;
+		hip->fs_blocks++;
 		inode_add_bytes(inode, sb->s_blocksize);
 		mark_inode_dirty(inode);
 	}
@@ -348,6 +358,7 @@ int hfsplus_file_extend(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 start, len, goal;
 	int res;
 
@@ -360,17 +371,17 @@ int hfsplus_file_extend(struct inode *inode)
 		return -ENOSPC;
 	}
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+	mutex_lock(&hip->extents_lock);
+	if (hip->alloc_blocks == hip->first_blocks)
+		goal = hfsplus_ext_lastblock(hip->first_extents);
 	else {
-		res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+		res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
 		if (res)
 			goto out;
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+		goal = hfsplus_ext_lastblock(hip->cached_extents);
 	}
 
-	len = HFSPLUS_I(inode).clump_blocks;
+	len = hip->clump_blocks;
 	start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
 	if (start >= sbi->total_blocks) {
 		start = hfsplus_block_allocate(sb, goal, 0, &len);
@@ -381,41 +392,41 @@ int hfsplus_file_extend(struct inode *inode)
 	}
 
 	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
-	if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
-		if (!HFSPLUS_I(inode).first_blocks) {
+
+	if (hip->alloc_blocks <= hip->first_blocks) {
+		if (!hip->first_blocks) {
 			dprint(DBG_EXTENT, "first extents\n");
 			/* no extents yet */
-			HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
-			HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+			hip->first_extents[0].start_block = cpu_to_be32(start);
+			hip->first_extents[0].block_count = cpu_to_be32(len);
 			res = 0;
 		} else {
 			/* try to append to extents in inode */
-			res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
-						 HFSPLUS_I(inode).alloc_blocks,
+			res = hfsplus_add_extent(hip->first_extents,
+						 hip->alloc_blocks,
 						 start, len);
 			if (res == -ENOSPC)
 				goto insert_extent;
 		}
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks += len;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks += len;
 		}
 	} else {
-		res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
-					 HFSPLUS_I(inode).alloc_blocks -
-					 HFSPLUS_I(inode).cached_start,
+		res = hfsplus_add_extent(hip->cached_extents,
+					 hip->alloc_blocks - hip->cached_start,
 					 start, len);
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
-			HFSPLUS_I(inode).cached_blocks += len;
+			hfsplus_dump_extent(hip->cached_extents);
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->cached_blocks += len;
 		} else if (res == -ENOSPC)
 			goto insert_extent;
 	}
 out:
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 	if (!res) {
-		HFSPLUS_I(inode).alloc_blocks += len;
+		hip->alloc_blocks += len;
 		mark_inode_dirty(inode);
 	}
 	return res;
@@ -424,13 +435,13 @@ insert_extent:
 	dprint(DBG_EXTENT, "insert new extent\n");
 	hfsplus_ext_write_extent(inode);
 
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
-	HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
-	hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-	HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
-	HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
-	HFSPLUS_I(inode).cached_blocks = len;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_extents[0].start_block = cpu_to_be32(start);
+	hip->cached_extents[0].block_count = cpu_to_be32(len);
+	hfsplus_dump_extent(hip->cached_extents);
+	hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+	hip->cached_start = hip->alloc_blocks;
+	hip->cached_blocks = len;
 
 	res = 0;
 	goto out;
@@ -439,13 +450,15 @@ insert_extent:
 void hfsplus_file_truncate(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	u32 alloc_cnt, blk_cnt, start;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-	       (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
-	if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+		inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+	if (inode->i_size > hip->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
 		struct page *page;
 		void *fsdata;
@@ -462,48 +475,48 @@ void hfsplus_file_truncate(struct inode *inode)
 			return;
 		mark_inode_dirty(inode);
 		return;
-	} else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+	} else if (inode->i_size == hip->phys_size)
 		return;
 
 	blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
 			HFSPLUS_SB(sb)->alloc_blksz_shift;
-	alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+	alloc_cnt = hip->alloc_blocks;
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&hip->extents_lock);
 	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	while (1) {
-		if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
-			hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+		if (alloc_cnt == hip->first_blocks) {
+			hfsplus_free_extents(sb, hip->first_extents,
 					     alloc_cnt, alloc_cnt - blk_cnt);
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks = blk_cnt;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks = blk_cnt;
 			break;
 		}
 		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
 		if (res)
 			break;
-		start = HFSPLUS_I(inode).cached_start;
-		hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+		start = hip->cached_start;
+		hfsplus_free_extents(sb, hip->cached_extents,
 				     alloc_cnt - start, alloc_cnt - blk_cnt);
-		hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+		hfsplus_dump_extent(hip->cached_extents);
 		if (blk_cnt > start) {
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
 			break;
 		}
 		alloc_cnt = start;
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
-	HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+	hip->alloc_blocks = blk_cnt;
 out:
-	HFSPLUS_I(inode).phys_size = inode->i_size;
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+	hip->phys_size = inode->i_size;
+	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
 	mark_inode_dirty(inode);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 55f42b49f0f1..169cef964b51 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -187,8 +187,8 @@ struct hfsplus_inode_info {
 #define HFSPLUS_FLG_EXT_DIRTY	0x0002
 #define HFSPLUS_FLG_EXT_NEW	0x0004
 
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
 
 struct hfs_find_data {
 	/* filled by caller */
@@ -379,13 +379,11 @@ static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
-/*
+
 static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 {
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
-*/
-#define HFSPLUS_I(inode)	(*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
 
 #if 1
 #define hfsplus_kmap(p)		({ struct page *__p = (p); kmap(__p); })
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 050eee6c81bd..309defb4f41f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
 	*pagep = NULL;
 	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfsplus_get_block,
-				&HFSPLUS_I(mapping->host).phys_size);
+				&HFSPLUS_I(mapping->host)->phys_size);
 	if (unlikely(ret)) {
 		loff_t isize = mapping->host->i_size;
 		if (pos + len > isize)
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	struct hfs_find_data fd;
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
+	struct hfsplus_inode_info *hip;
 	int err;
 
 	if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
 		goto out;
 
-	inode = HFSPLUS_I(dir).rsrc_inode;
+	inode = HFSPLUS_I(dir)->rsrc_inode;
 	if (inode)
 		goto out;
 
@@ -185,10 +186,11 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	hip = HFSPLUS_I(inode);
 	inode->i_ino = dir->i_ino;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	hip->flags = HFSPLUS_FLG_RSRC;
 
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	err = hfsplus_find_cat(sb, dir->i_ino, &fd);
@@ -199,8 +201,8 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 		iput(inode);
 		return ERR_PTR(err);
 	}
-	HFSPLUS_I(inode).rsrc_inode = dir;
-	HFSPLUS_I(dir).rsrc_inode = inode;
+	hip->rsrc_inode = dir;
+	HFSPLUS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
 	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb)->rsrc_inodes);
 	mark_inode_dirty(inode);
@@ -231,8 +233,8 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
 		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
 	inode->i_mode = mode;
 
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+	HFSPLUS_I(inode)->rootflags = perms->rootflags;
+	HFSPLUS_I(inode)->userflags = perms->userflags;
 	if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
@@ -253,20 +255,20 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	perms->userflags = HFSPLUS_I(inode).userflags;
+	perms->userflags = HFSPLUS_I(inode)->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
-	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+	perms->dev = cpu_to_be32(HFSPLUS_I(inode)->dev);
 }
 
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
+		inode = HFSPLUS_I(inode)->rsrc_inode;
 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 		return -EOVERFLOW;
-	atomic_inc(&HFSPLUS_I(inode).opencnt);
+	atomic_inc(&HFSPLUS_I(inode)->opencnt);
 	return 0;
 }
 
@@ -275,8 +277,8 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	struct super_block *sb = inode->i_sb;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
-	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+		inode = HFSPLUS_I(inode)->rsrc_inode;
+	if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
 		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
@@ -362,6 +364,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct inode *inode = new_inode(sb);
+	struct hfsplus_inode_info *hip;
 
 	if (!inode)
 		return NULL;
@@ -372,19 +375,21 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	inode->i_gid = current_fsgid();
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-	HFSPLUS_I(inode).flags = 0;
-	memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).alloc_blocks = 0;
-	HFSPLUS_I(inode).first_blocks = 0;
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-	HFSPLUS_I(inode).phys_size = 0;
-	HFSPLUS_I(inode).fs_blocks = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
+
+	hip = HFSPLUS_I(inode);
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	atomic_set(&hip->opencnt, 0);
+	hip->flags = 0;
+	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->alloc_blocks = 0;
+	hip->first_blocks = 0;
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+	hip->phys_size = 0;
+	hip->fs_blocks = 0;
+	hip->rsrc_inode = NULL;
 	if (S_ISDIR(inode->i_mode)) {
 		inode->i_size = 2;
 		sbi->folder_count++;
@@ -395,12 +400,12 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 		inode->i_op = &hfsplus_file_inode_operations;
 		inode->i_fop = &hfsplus_file_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = sbi->data_clump_blocks;
+		hip->clump_blocks = sbi->data_clump_blocks;
 	} else if (S_ISLNK(inode->i_mode)) {
 		sbi->file_count++;
 		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = 1;
+		hip->clump_blocks = 1;
 	} else
 		sbi->file_count++;
 	insert_inode_hash(inode);
@@ -436,26 +441,27 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
 	struct super_block *sb = inode->i_sb;
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 count;
 	int i;
 
-	memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
-	       sizeof(hfsplus_extent_rec));
+	memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
 	for (count = 0, i = 0; i < 8; i++)
 		count += be32_to_cpu(fork->extents[i].block_count);
-	HFSPLUS_I(inode).first_blocks = count;
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-
-	HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
-	inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-	HFSPLUS_I(inode).clump_blocks =
+	hip->first_blocks = count;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+
+	hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+	hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+	hip->fs_blocks =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+	hip->clump_blocks =
 		be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
-	if (!HFSPLUS_I(inode).clump_blocks) {
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+	if (!hip->clump_blocks) {
+		hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
 			sbi->rsrc_clump_blocks :
 			sbi->data_clump_blocks;
 	}
@@ -463,10 +469,10 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 
 void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
-	memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+	memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
 	       sizeof(hfsplus_extent_rec));
 	fork->total_size = cpu_to_be64(inode->i_size);
-	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
 }
 
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -477,7 +483,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
-	HFSPLUS_I(inode).dev = 0;
+	HFSPLUS_I(inode)->dev = 0;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
@@ -491,8 +497,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(folder->access_date);
 		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = folder->create_date;
-		HFSPLUS_I(inode).fs_blocks = 0;
+		HFSPLUS_I(inode)->create_date = folder->create_date;
+		HFSPLUS_I(inode)->fs_blocks = 0;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (type == HFSPLUS_FILE) {
@@ -523,7 +529,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(file->access_date);
 		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = file->create_date;
+		HFSPLUS_I(inode)->create_date = file->create_date;
 	} else {
 		printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
 		res = -EIO;
@@ -538,7 +544,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 	hfsplus_cat_entry entry;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		main_inode = HFSPLUS_I(inode).rsrc_inode;
+		main_inode = HFSPLUS_I(inode)->rsrc_inode;
 
 	if (!main_inode->i_nlink)
 		return 0;
@@ -582,9 +588,9 @@ int hfsplus_cat_write_inode(struct inode *inode)
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
 		if (S_ISREG(inode->i_mode))
-			HFSPLUS_I(inode).dev = inode->i_nlink;
+			HFSPLUS_I(inode)->dev = inode->i_nlink;
 		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
+			HFSPLUS_I(inode)->dev = kdev_t_to_nr(inode->i_rdev);
 		hfsplus_set_perms(inode, &file->permissions);
 		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 66dd64b88b2e..c9ac443d202a 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -23,13 +23,14 @@
 static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags = 0;
 
-	if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
+	if (hip->rootflags & HFSPLUS_FLG_IMMUTABLE)
 		flags |= FS_IMMUTABLE_FL;
-	if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
+	if (hip->rootflags & HFSPLUS_FLG_APPEND)
 		flags |= FS_APPEND_FL;
-	if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
+	if (hip->userflags & HFSPLUS_FLG_NODUMP)
 		flags |= FS_NODUMP_FL;
 
 	return put_user(flags, user_flags);
@@ -38,6 +39,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags;
 	int err = 0;
 
@@ -58,7 +60,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	mutex_lock(&inode->i_mutex);
 
 	if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-	    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
+	    hip->rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
 			err = -EPERM;
 			goto out_unlock_inode;
@@ -72,22 +74,22 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	}
 	if (flags & FS_IMMUTABLE_FL) {
 		inode->i_flags |= S_IMMUTABLE;
-		HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
+		hip->rootflags |= HFSPLUS_FLG_IMMUTABLE;
 	} else {
 		inode->i_flags &= ~S_IMMUTABLE;
-		HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
+		hip->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
 	}
 	if (flags & FS_APPEND_FL) {
 		inode->i_flags |= S_APPEND;
-		HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
+		hip->rootflags |= HFSPLUS_FLG_APPEND;
 	} else {
 		inode->i_flags &= ~S_APPEND;
-		HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+		hip->rootflags &= ~HFSPLUS_FLG_APPEND;
 	}
 	if (flags & FS_NODUMP_FL)
-		HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
+		hip->userflags |= HFSPLUS_FLG_NODUMP;
 	else
-		HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
+		hip->userflags &= ~HFSPLUS_FLG_NODUMP;
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1bf00b9ecc1a..a7bf89e85b3b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -33,11 +33,11 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
+	INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+	mutex_init(&HFSPLUS_I(inode)->extents_lock);
+	HFSPLUS_I(inode)->flags = 0;
+	HFSPLUS_I(inode)->rsrc_inode = NULL;
+	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
 
 	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
 	read_inode:
@@ -151,8 +151,8 @@ static void hfsplus_evict_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
-		HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-		iput(HFSPLUS_I(inode).rsrc_inode);
+		HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFSPLUS_I(inode)->rsrc_inode);
 	}
 }
 
@@ -491,7 +491,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 
 static void hfsplus_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
-- 
cgit v1.2.3


From fc4fff82104fa096eada73943fe5249500acd5fa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:41 +0200
Subject: hfsplus: clean up hfsplus_iget

Add a new hfsplus_system_read_inode for reading the special system inodes
and streamline the fastpath iget code.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 77 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a7bf89e85b3b..bd09ea23435b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -20,12 +20,42 @@ static void hfsplus_destroy_inode(struct inode *inode);
 
 #include "hfsplus_fs.h"
 
+static int hfsplus_system_read_inode(struct inode *inode)
+{
+	struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
+
+	switch (inode->i_ino) {
+	case HFSPLUS_EXT_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->ext_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	case HFSPLUS_CAT_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->cat_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	case HFSPLUS_ALLOC_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->alloc_file);
+		inode->i_mapping->a_ops = &hfsplus_aops;
+		break;
+	case HFSPLUS_START_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->start_file);
+		break;
+	case HFSPLUS_ATTR_CNID:
+		hfsplus_inode_read_fork(inode, &vhdr->attr_file);
+		inode->i_mapping->a_ops = &hfsplus_btree_aops;
+		break;
+	default:
+		return -EIO;
+	}
+
+	return 0;
+}
+
 struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 {
 	struct hfs_find_data fd;
-	struct hfsplus_vh *vhdr;
 	struct inode *inode;
-	long err = -EIO;
+	int err;
 
 	inode = iget_locked(sb, ino);
 	if (!inode)
@@ -39,51 +69,24 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 	HFSPLUS_I(inode)->rsrc_inode = NULL;
 	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
 
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-	read_inode:
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID) {
 		hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
 		if (!err)
 			err = hfsplus_cat_read_inode(inode, &fd);
 		hfs_find_exit(&fd);
-		if (err)
-			goto bad_inode;
-		goto done;
+	} else {
+		err = hfsplus_system_read_inode(inode);
 	}
-	vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
-	switch(inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		goto read_inode;
-	case HFSPLUS_EXT_CNID:
-		hfsplus_inode_read_fork(inode, &vhdr->ext_file);
-		inode->i_mapping->a_ops = &hfsplus_btree_aops;
-		break;
-	case HFSPLUS_CAT_CNID:
-		hfsplus_inode_read_fork(inode, &vhdr->cat_file);
-		inode->i_mapping->a_ops = &hfsplus_btree_aops;
-		break;
-	case HFSPLUS_ALLOC_CNID:
-		hfsplus_inode_read_fork(inode, &vhdr->alloc_file);
-		inode->i_mapping->a_ops = &hfsplus_aops;
-		break;
-	case HFSPLUS_START_CNID:
-		hfsplus_inode_read_fork(inode, &vhdr->start_file);
-		break;
-	case HFSPLUS_ATTR_CNID:
-		hfsplus_inode_read_fork(inode, &vhdr->attr_file);
-		inode->i_mapping->a_ops = &hfsplus_btree_aops;
-		break;
-	default:
-		goto bad_inode;
+
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
 
-done:
 	unlock_new_inode(inode);
 	return inode;
-
-bad_inode:
-	iget_failed(inode);
-	return ERR_PTR(err);
 }
 
 static int hfsplus_write_inode(struct inode *inode,
-- 
cgit v1.2.3


From b5080f77ed2de3c8ac67a63044f8a781c75207d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:43 +0200
Subject: hfsplus: clean up hfsplus_write_inode

Add a new hfsplus_system_write_inode for writing the special system inodes
and streamline the fastpath write_inode code.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 80 +++++++++++++++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index bd09ea23435b..e485a38b994d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -89,63 +89,57 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
-static int hfsplus_write_inode(struct inode *inode,
-		struct writeback_control *wbc)
+static int hfsplus_system_write_inode(struct inode *inode)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
-	struct hfsplus_vh *vhdr;
-	int ret = 0;
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	struct hfsplus_fork_raw *fork;
+	struct hfs_btree *tree = NULL;
 
-	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-	hfsplus_ext_write_extent(inode);
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-		return hfsplus_cat_write_inode(inode);
-	}
-	vhdr = sbi->s_vhdr;
 	switch (inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		ret = hfsplus_cat_write_inode(inode);
-		break;
 	case HFSPLUS_EXT_CNID:
-		if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-		hfs_btree_write(sbi->ext_tree);
+		fork = &vhdr->ext_file;
+		tree = sbi->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-		hfs_btree_write(sbi->cat_tree);
+		fork = &vhdr->cat_file;
+		tree = sbi->cat_tree;
 		break;
 	case HFSPLUS_ALLOC_CNID:
-		if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+		fork = &vhdr->alloc_file;
 		break;
 	case HFSPLUS_START_CNID:
-		if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->start_file);
+		fork = &vhdr->start_file;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-			sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-		hfs_btree_write(sbi->attr_tree);
-		break;
+		fork = &vhdr->attr_file;
+		tree = sbi->attr_tree;
+	default:
+		return -EIO;
+	}
+
+	if (fork->total_size != cpu_to_be64(inode->i_size)) {
+		sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
+		inode->i_sb->s_dirt = 1;
 	}
-	return ret;
+	hfsplus_inode_write_fork(inode, fork);
+	if (tree)
+		hfs_btree_write(tree);
+	return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
+{
+	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+	hfsplus_ext_write_extent(inode);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID)
+		return hfsplus_cat_write_inode(inode);
+	else
+		return hfsplus_system_write_inode(inode);
 }
 
 static void hfsplus_evict_inode(struct inode *inode)
-- 
cgit v1.2.3


From 30d3abbec730a5a9c954a6342271f7a7db155b08 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:50 +0200
Subject: hfsplus: merge mknod/mkdir/creat

Make hfsplus_mkdir and hfsplus_create call hfsplus_mknod instead of
duplicating the code.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c | 64 ++++++++++++++------------------------------------------
 1 file changed, 16 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5ca8308f9eb7..7efcf75ea73a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -237,28 +237,6 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-			  struct nameidata *nd)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			struct dentry *dst_dentry)
 {
@@ -365,27 +343,6 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	return res;
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode;
@@ -438,12 +395,10 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t rdev)
 {
-	struct super_block *sb;
 	struct inode *inode;
 	int res;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, mode);
+	inode = hfsplus_new_inode(dir->i_sb, mode);
 	if (!inode)
 		return -ENOSPC;
 
@@ -454,13 +409,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 		iput(inode);
 		return res;
 	}
-	init_special_inode(inode, mode, rdev);
+
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
+
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
-
 	return 0;
 }
 
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+			  struct nameidata *nd)
+{
+	return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
+}
+
 static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry)
 {
-- 
cgit v1.2.3


From f17c89bfcc9cccd405098eac3ec1ebfddf03279e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:54 +0200
Subject: hfsplus: fix error handling in hfsplus_symlink

We need to free the inode again on a hfsplus_create_cat failure.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 7efcf75ea73a..f8ae468f4ab6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -364,31 +364,29 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
-	struct super_block *sb;
 	struct inode *inode;
 	int res;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
 	if (!inode)
 		return -ENOSPC;
 
 	res = page_symlink(inode, symname, strlen(symname) + 1);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
+	if (res)
+		goto out_err;
 
-	mark_inode_dirty(inode);
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto out_err;
 
-	if (!res) {
-		hfsplus_instantiate(dentry, inode, inode->i_ino);
-		mark_inode_dirty(inode);
-	}
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	return 0;
 
+out_err:
+	inode->i_nlink = 0;
+	hfsplus_delete_inode(inode);
+	iput(inode);
 	return res;
 }
 
-- 
cgit v1.2.3


From 66e5db05bb6670f314d90aba5998e6a033e4d563 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:43:58 +0200
Subject: hfsplus: do not cache and write next_alloc

We never look at it, nor change the next_alloc field in the superblock.  So
don't bother caching it or writing it out in hfsplus_sync_fs.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h | 1 -
 fs/hfsplus/super.c      | 2 --
 2 files changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 169cef964b51..5e2418a71887 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -129,7 +129,6 @@ struct hfsplus_sb_info {
 	int alloc_blksz_shift;
 	u32 total_blocks;
 	u32 free_blocks;
-	u32 next_alloc;
 	u32 next_cnid;
 	u32 file_count;
 	u32 folder_count;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index e485a38b994d..f310a1fdcd5f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -164,7 +164,6 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 	sb->s_dirt = 0;
 
 	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
-	vhdr->next_alloc = cpu_to_be32(sbi->next_alloc);
 	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
 	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
 	vhdr->file_count = cpu_to_be32(sbi->file_count);
@@ -355,7 +354,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
 	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
-	sbi->next_alloc = be32_to_cpu(vhdr->next_alloc);
 	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
 	sbi->file_count = be32_to_cpu(vhdr->file_count);
 	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
-- 
cgit v1.2.3


From 58a818f532e83f337689358c102ba2048d1b37f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:44:02 +0200
Subject: hfsplus: remove the rsrc_inodes list

We never walk the list - the only reason for it is to make the resource fork
inodes appear hashed to the writeback code.  Borrow a trick from JFS to do
that without needing a list head.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h |  2 --
 fs/hfsplus/inode.c      | 10 +++++++++-
 fs/hfsplus/super.c      |  1 -
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5e2418a71887..0cd9ba00f968 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -145,8 +145,6 @@ struct hfsplus_sb_info {
 	int part, session;
 
 	unsigned long flags;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFSPLUS_SB_WRITEBACKUP	0x0001
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 309defb4f41f..a05b3afa7230 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -204,7 +204,15 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	hip->rsrc_inode = dir;
 	HFSPLUS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb)->rsrc_inodes);
+
+	/*
+	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+	 * want resource fork inodes in the regular inode space, we make them
+	 * appear hashed, but do not put on any lists.  hlist_del()
+	 * will work fine and require no locking.
+	 */
+	inode->i_hash.pprev = &inode->i_hash.next;
+
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f310a1fdcd5f..923f385b32ca 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -318,7 +318,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 	mutex_init(&sbi->alloc_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
-- 
cgit v1.2.3


From 7ac9fb9c2a50963b699b3548e6f00698c1554dc6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:45:08 +0200
Subject: hfsplus: add per-superblock lock for volume header updates

Lock updates to the mutal fields in the volume header, and document the
locing in the hfsplus_sb_info structure.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c        | 54 ++++++++++++++++++++++++++++++++-----------------
 fs/hfsplus/hfsplus_fs.h | 13 +++++++-----
 fs/hfsplus/super.c      |  7 +++++++
 3 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f8ae468f4ab6..1c81eedcab01 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -251,6 +251,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
 		for (;;) {
 			get_random_bytes(&id, sizeof(cnid));
@@ -263,7 +264,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			if (!res)
 				break;
 			if (res != -EEXIST)
-				return res;
+				goto out;
 		}
 		HFSPLUS_I(inode)->dev = id;
 		cnid = sbi->next_cnid++;
@@ -271,13 +272,13 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
-			return res;
+			goto out;
 		sbi->file_count++;
 	}
 	cnid = sbi->next_cnid++;
 	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
 	if (res)
-		return res;
+		goto out;
 
 	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
@@ -286,8 +287,9 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	mark_inode_dirty(inode);
 	sbi->file_count++;
 	dst_dir->i_sb->s_dirt = 1;
-
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
@@ -302,6 +304,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	cnid = (u32)(unsigned long)dentry->d_fsdata;
 	if (inode->i_ino == cnid &&
 	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
@@ -312,11 +315,11 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 					 sbi->hidden_dir, &str);
 		if (!res)
 			inode->i_flags |= S_DEAD;
-		return res;
+		goto out;
 	}
 	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 
 	if (inode->i_nlink > 0)
 		drop_nlink(inode);
@@ -339,37 +342,44 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		sbi->file_count--;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
+
+	mutex_lock(&sbi->vh_mutex);
 	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_delete_inode(inode);
 	mark_inode_dirty(inode);
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
+	mutex_lock(&sbi->vh_mutex);
 	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
 
 	res = page_symlink(inode, symname, strlen(symname) + 1);
 	if (res)
@@ -381,31 +391,35 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
-	return 0;
+	goto out;
 
 out_err:
 	inode->i_nlink = 0;
 	hfsplus_delete_inode(inode);
 	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t rdev)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
+	mutex_lock(&sbi->vh_mutex);
 	inode = hfsplus_new_inode(dir->i_sb, mode);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
 
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
 	if (res) {
 		inode->i_nlink = 0;
 		hfsplus_delete_inode(inode);
 		iput(inode);
-		return res;
+		goto out;
 	}
 
 	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
@@ -413,7 +427,9 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 0cd9ba00f968..08865ed70f00 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -116,23 +116,26 @@ struct hfsplus_sb_info {
 	struct inode *hidden_dir;
 	struct nls_table *nls;
 
-	/* synchronize block allocations */
-	struct mutex alloc_mutex;
-
 	/* Runtime variables */
 	u32 blockoffset;
 	u32 sect_count;
 	int fs_shift;
 
-	/* Stuff in host order from Vol Header */
+	/* immutable data from the volume header */
 	u32 alloc_blksz;
 	int alloc_blksz_shift;
 	u32 total_blocks;
+	u32 data_clump_blocks, rsrc_clump_blocks;
+
+	/* mutable data from the volume header, protected by alloc_mutex */
 	u32 free_blocks;
+	struct mutex alloc_mutex;
+
+	/* mutable data from the volume header, protected by vh_mutex */
 	u32 next_cnid;
 	u32 file_count;
 	u32 folder_count;
-	u32 data_clump_blocks, rsrc_clump_blocks;
+	struct mutex vh_mutex;
 
 	/* Config options */
 	u32 creator;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 923f385b32ca..b766c170e4d8 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -160,6 +160,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
+	mutex_lock(&sbi->vh_mutex);
 	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
@@ -194,6 +195,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 		sbi->flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
 	mutex_unlock(&sbi->alloc_mutex);
+	mutex_unlock(&sbi->vh_mutex);
 	return 0;
 }
 
@@ -319,6 +321,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->alloc_mutex);
+	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -453,9 +456,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (!sbi->hidden_dir) {
 		printk(KERN_DEBUG "hfs: create hidden dir...\n");
+
+		mutex_lock(&sbi->vh_mutex);
 		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
 		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
 				   &str, sbi->hidden_dir);
+		mutex_unlock(&sbi->vh_mutex);
+
 		mark_inode_dirty(sbi->hidden_dir);
 	}
 out:
-- 
cgit v1.2.3


From 84adede31267af37141da2b2b04293c5ea8af7ae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:45:20 +0200
Subject: hfsplus: use atomic bitops for the superblock flags

The flags in the HFS+-specific superlock do get modified during runtime,
use atomic bitops to make the modifications SMP safe.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/btree.c      |  4 ++--
 fs/hfsplus/hfsplus_fs.h | 10 +++++-----
 fs/hfsplus/options.c    |  8 ++++----
 fs/hfsplus/super.c      | 10 ++++------
 fs/hfsplus/unicode.c    | 12 ++++++------
 fs/hfsplus/wrapper.c    | 14 +++++++++-----
 6 files changed, 30 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index f75cf222b8b4..27504f57a485 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -61,12 +61,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (id == HFSPLUS_EXT_CNID) {
 		tree->keycmp = hfsplus_ext_cmp_key;
 	} else if (id == HFSPLUS_CAT_CNID) {
-		if ((HFSPLUS_SB(sb)->flags & HFSPLUS_SB_HFSX) &&
+		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
 		else {
 			tree->keycmp = hfsplus_cat_case_cmp_key;
-			HFSPLUS_SB(sb)->flags |= HFSPLUS_SB_CASEFOLD;
+			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
 	} else {
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 08865ed70f00..c521d44f0747 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -150,11 +150,11 @@ struct hfsplus_sb_info {
 	unsigned long flags;
 };
 
-#define HFSPLUS_SB_WRITEBACKUP	0x0001
-#define HFSPLUS_SB_NODECOMPOSE	0x0002
-#define HFSPLUS_SB_FORCE	0x0004
-#define HFSPLUS_SB_HFSX		0x0008
-#define HFSPLUS_SB_CASEFOLD	0x0010
+#define HFSPLUS_SB_WRITEBACKUP	0
+#define HFSPLUS_SB_NODECOMPOSE	1
+#define HFSPLUS_SB_FORCE	2
+#define HFSPLUS_SB_HFSX		3
+#define HFSPLUS_SB_CASEFOLD	4
 
 
 struct hfsplus_inode_info {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index f2598aef206e..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 			kfree(p);
 			break;
 		case opt_decompose:
-			sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+			clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_nodecompose:
-			sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+			set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_force:
-			sbi->flags |= HFSPLUS_SB_FORCE;
+			set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
 			break;
 		default:
 			return 0;
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",session=%u", sbi->session);
 	if (sbi->nls)
 		seq_printf(seq, ",nls=%s", sbi->nls->charset);
-	if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+	if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
 		seq_printf(seq, ",nodecompose");
 	return 0;
 }
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index b766c170e4d8..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -119,7 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode)
 	}
 
 	if (fork->total_size != cpu_to_be64(inode->i_size)) {
-		sbi->flags |= HFSPLUS_SB_WRITEBACKUP;
+		set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
 		inode->i_sb->s_dirt = 1;
 	}
 	hfsplus_inode_write_fork(inode, fork);
@@ -170,7 +170,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
 	mark_buffer_dirty(sbi->s_vhbh);
-	if (sbi->flags & HFSPLUS_SB_WRITEBACKUP) {
+	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
 		if (sbi->sect_count) {
 			struct buffer_head *bh;
 			u32 block, offset;
@@ -192,7 +192,6 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 					printk(KERN_WARNING "hfs: backup not found!\n");
 			}
 		}
-		sbi->flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
 	mutex_unlock(&sbi->alloc_mutex);
 	mutex_unlock(&sbi->vh_mutex);
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			       "running fsck.hfsplus is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
-		} else if (sbi.flags & HFSPLUS_SB_FORCE) {
+		} else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
 			/* nothing */
 		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -376,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
 		       "running fsck.hfsplus is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if (sbi->flags & HFSPLUS_SB_FORCE) {
+	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -386,7 +385,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		       "use the force option at your own risk, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
-	sbi->flags &= ~HFSPLUS_SB_FORCE;
 
 	/* Load metadata objects (B*Trees) */
 	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index a15fcffce74f..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 	ustrlen = be16_to_cpu(ustr->length);
 	len = *len_p;
 	ce1 = NULL;
-	compose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
+	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 
 	while (ustrlen > 0) {
 		c0 = be16_to_cpu(*ip++);
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	u16 *dstr, outlen = 0;
 	wchar_t c;
 
-	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 	wchar_t c;
 	u16 c2;
 
-	casefold = (HFSPLUS_SB(sb)->flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	hash = init_name_hash();
 	astr = str->name;
 	len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 	u16 c1, c2;
 	wchar_t c;
 
-	casefold = (HFSPLUS_SB(sb)->flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb)->flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	astr1 = s1->name;
 	len1 = s1->len;
 	astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index c89e198e8a2a..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -123,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
 			break;
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			sbi->flags |= HFSPLUS_SB_HFSX;
+			set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
 			break;
 		}
 		brelse(bh);
@@ -169,10 +169,14 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		return -EIO;
 
 	/* should still be the same... */
-	if (vhdr->signature != (sbi->flags & HFSPLUS_SB_HFSX ?
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-		goto error;
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+			goto error;
+	} else {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+			goto error;
+	}
+
 	sbi->s_vhbh = bh;
 	sbi->s_vhdr = vhdr;
 
-- 
cgit v1.2.3


From 89755dcace09b44b3aa024bf302d9b19b4c24cad Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:45:25 +0200
Subject: hfsplus: protect readdir against removals from open_dir_list

We already have i_mutex for readdir and the namespace operations that add
entries to open_dir_list, the only thing that was missing was the removal
in hfsplus_dir_release.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 1c81eedcab01..93fa45cc4810 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -231,7 +231,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfsplus_readdir_data *rd = file->private_data;
 	if (rd) {
+		mutex_lock(&inode->i_mutex);
 		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
 		kfree(rd);
 	}
 	return 0;
-- 
cgit v1.2.3


From 7fcc99f4f2ddb1c39abc05fbb9b32f05b03c7f8f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 05:46:31 +0200
Subject: hfsplus: add missing extent locking in hfsplus_write_inode

Most of the extent handling code already does proper SMP locking, but
hfsplus_write_inode was calling into hfsplus_ext_write_extent without
taking the extents_lock.  Fix this by splitting hfsplus_ext_write_extent
into an internal helper that expects the lock, and a public interface
that first acquires it.

Also add a few locking asserts and document the locking rules in
hfsplus_fs.h.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/extents.c    | 15 +++++++++++++--
 fs/hfsplus/hfsplus_fs.h | 34 +++++++++++++++++++++-------------
 2 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index b1017eaa4fdc..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -88,6 +88,8 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
 	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
 			      HFSPLUS_IS_RSRC(inode) ?
 				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
@@ -108,7 +110,7 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
 	}
 }
 
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
 	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
@@ -119,6 +121,13 @@ void hfsplus_ext_write_extent(struct inode *inode)
 	}
 }
 
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+	mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+	hfsplus_ext_write_extent_locked(inode);
+	mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
 static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 					    struct hfsplus_extent *extent,
 					    u32 cnid, u32 block, u8 type)
@@ -144,6 +153,8 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
 	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
@@ -433,7 +444,7 @@ out:
 
 insert_extent:
 	dprint(DBG_EXTENT, "insert new extent\n");
-	hfsplus_ext_write_extent(inode);
+	hfsplus_ext_write_extent_locked(inode);
 
 	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
 	hip->cached_extents[0].start_block = cpu_to_be32(start);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c521d44f0747..c007cc201279 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -158,28 +158,36 @@ struct hfsplus_sb_info {
 
 
 struct hfsplus_inode_info {
-	struct mutex extents_lock;
-	u32 clump_blocks, alloc_blocks;
-	sector_t fs_blocks;
-	/* Allocation extents from catalog record or volume header */
-	hfsplus_extent_rec first_extents;
-	u32 first_blocks;
-	hfsplus_extent_rec cached_extents;
-	u32 cached_start, cached_blocks;
 	atomic_t opencnt;
 
-	struct inode *rsrc_inode;
+	/*
+	 * Extent allocation information, protected by extents_lock.
+	 */
+	u32 first_blocks;
+	u32 clump_blocks;
+	u32 alloc_blocks;
+	u32 cached_start;
+	u32 cached_blocks;
+	hfsplus_extent_rec first_extents;
+	hfsplus_extent_rec cached_extents;
 	unsigned long flags;
+	struct mutex extents_lock;
 
+	/*
+	 * Immutable data.
+	 */
+	struct inode *rsrc_inode;
 	__be32 create_date;
-	/* Device number in hfsplus_permissions in catalog */
 	u32 dev;
-	/* BSD system and user file flags */
-	u8 rootflags;
-	u8 userflags;
 
+	/*
+	 * Protected by i_mutex.
+	 */
+	sector_t fs_blocks;
+	u8 rootflags, userflags;	/* BSD system and user file flags */
 	struct list_head open_dir_list;
 	loff_t phys_size;
+
 	struct inode vfs_inode;
 };
 
-- 
cgit v1.2.3


From 467c3d9cd541eef284ff8118069b088e015b8d6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 Oct 2010 05:46:52 +0200
Subject: hfsplus: convert tree_lock to mutex

tree_lock is used as mutex so make it a mutex.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bfind.c      | 4 ++--
 fs/hfsplus/btree.c      | 2 +-
 fs/hfsplus/hfsplus_fs.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..68c7983f0289 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 27504f57a485..8306479f6b3a 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	tree->sb = sb;
 	tree->cnid = id;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c007cc201279..5cda96366acf 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
-- 
cgit v1.2.3


From 40de9a7cebc4e0b23cd6863c84c2279f0ccadebb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 1 Oct 2010 09:12:08 +0200
Subject: hfsplus: fix rename over directories

When renaming over a directory we need to use hfsplus_rmdir instead of
hfsplus_unlink to evict the victim.  This makes sure we properly error out
on non-empty directory as required by Posix (BZ #16571), and it also makes
sure we do the right thing in case i_nlink will every be set correctly for
directories on hfsplus.

Reported-by: Vlado Plaga <rechner@vlado-do.de>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 93fa45cc4810..33aab211695a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -452,7 +452,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfsplus_unlink(new_dir, new_dentry);
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
+			res = hfsplus_rmdir(new_dir, new_dentry);
+		else
+			res = hfsplus_unlink(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
-- 
cgit v1.2.3


From 2b44f1ba40914777f4b1075254ba97663d4e2574 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 30 Sep 2010 20:47:46 +0200
Subject: nfsd4: adjust buflen for encoded attrs bitmap based on actual bitmap
 length

The existing code adjusted it based on the worst case scenario for the returned
bitmap and the best case scenario for the supported attrs attribute.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[bfields@redhat.com: removed likely/unlikely's]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..f35a94a04026 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 				goto out_nfserr;
 		}
 	}
-	if ((buflen -= 16) < 0)
-		goto out_resource;
 
-	if (unlikely(bmval2)) {
+	if (bmval2) {
+		if ((buflen -= 16) < 0)
+			goto out_resource;
 		WRITE32(3);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 		WRITE32(bmval2);
-	} else if (likely(bmval1)) {
+	} else if (bmval1) {
+		if ((buflen -= 12) < 0)
+			goto out_resource;
 		WRITE32(2);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 	} else {
+		if ((buflen -= 8) < 0)
+			goto out_resource;
 		WRITE32(1);
 		WRITE32(bmval0);
 	}
@@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		u32 word1 = nfsd_suppattrs1(minorversion);
 		u32 word2 = nfsd_suppattrs2(minorversion);
 
-		if ((buflen -= 12) < 0)
-			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!word2) {
+			if ((buflen -= 12) < 0)
+				goto out_resource;
 			WRITE32(2);
 			WRITE32(word0);
 			WRITE32(word1);
 		} else {
+			if ((buflen -= 16) < 0)
+				goto out_resource;
 			WRITE32(3);
 			WRITE32(word0);
 			WRITE32(word1);
-- 
cgit v1.2.3


From fc5d00b04a3a58cac8620403dfe9f43f72578ec1 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 29 Sep 2010 16:03:50 +0400
Subject: sunrpc: Add net argument to svc_create_xprt

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c                  | 2 +-
 fs/nfs/callback.c               | 4 ++--
 fs/nfsd/nfsctl.c                | 4 ++--
 fs/nfsd/nfssvc.c                | 5 +++--
 include/linux/sunrpc/svc_xprt.h | 4 ++--
 net/sunrpc/svc_xprt.c           | 4 ++--
 6 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..b13aabc12298 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -206,7 +206,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
 
 	xprt = svc_find_xprt(serv, name, family, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, family, port,
+		return svc_create_xprt(serv, name, &init_net, family, port,
 						SVC_SOCK_DEFAULTS);
 	svc_xprt_put(xprt);
 	return 0;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..aeec017fe814 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
 {
 	int ret;
 
-	ret = svc_create_xprt(serv, "tcp", PF_INET,
+	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
@@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
 			nfs_callback_tcpport, PF_INET);
 
-	ret = svc_create_xprt(serv, "tcp", PF_INET6,
+	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret > 0) {
 		nfs_callback_tcpport6 = ret;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b6e192d25633..b81da24b768c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1015,12 +1015,12 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (err != 0)
 		return err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0)
 		goto out_err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET6, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..2bae1d86f5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
 	if (!list_empty(&nfsd_serv->sv_permsocks))
 		return 0;
 
-	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+	error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
 
-	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+	error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index e50e3eca1c7c..646263cf815d 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -74,8 +74,8 @@ int	svc_reg_xprt_class(struct svc_xprt_class *);
 void	svc_unreg_xprt_class(struct svc_xprt_class *);
 void	svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
 		      struct svc_serv *);
-int	svc_create_xprt(struct svc_serv *, const char *, const int,
-			const unsigned short, int);
+int	svc_create_xprt(struct svc_serv *, const char *, struct net *,
+			const int, const unsigned short, int);
 void	svc_xprt_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_received(struct svc_xprt *);
 void	svc_xprt_put(struct svc_xprt *xprt);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f7e8915051b1..d80789a37d88 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -204,8 +204,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 }
 
 int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
-		    const int family, const unsigned short port,
-		    int flags)
+		    struct net *net, const int family,
+		    const unsigned short port, int flags)
 {
 	struct svc_xprt_class *xcl;
 
-- 
cgit v1.2.3


From c653ce3f0aee9bb2b221ebf3579385c06f81efcd Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 29 Sep 2010 16:04:45 +0400
Subject: sunrpc: Add net to rpc_create_args

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/host.c             | 1 +
 fs/lockd/mon.c              | 1 +
 fs/nfs/client.c             | 1 +
 fs/nfs/mount_clnt.c         | 2 ++
 fs/nfsd/nfs4callback.c      | 1 +
 include/linux/sunrpc/clnt.h | 1 +
 net/sunrpc/rpcb_clnt.c      | 2 ++
 7 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..25e21e4023b2 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
 			.to_retries	= 5U,
 		};
 		struct rpc_create_args args = {
+			.net		= &init_net,
 			.protocol	= host->h_proto,
 			.address	= nlm_addr(host),
 			.addrsize	= host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..e0c918949644 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
 		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
 	};
 	struct rpc_create_args args = {
+		.net			= &init_net,
 		.protocol		= XPRT_TRANSPORT_UDP,
 		.address		= (struct sockaddr *)&sin,
 		.addrsize		= sizeof(sin),
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..351b71187b38 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -601,6 +601,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..4b472038342b 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
 		.rpc_resp	= &result,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= info->protocol,
 		.address	= info->sap,
 		.addrsize	= info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
 		.to_retries = 2,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= IPPROTO_UDP,
 		.address	= info->sap,
 		.addrsize	= info->salen,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 014482c4e57d..1112f451295a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -479,6 +479,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 		.to_retries	= 0,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *) &cb->cb_addr,
 		.addrsize	= cb->cb_addrlen,
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 85f38a63f098..58c4473f899a 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -102,6 +102,7 @@ struct rpc_procinfo {
 #ifdef __KERNEL__
 
 struct rpc_create_args {
+	struct net		*net;
 	int			protocol;
 	struct sockaddr		*address;
 	size_t			addrsize;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index dac219a56ae1..83af38df3267 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -177,6 +177,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex);
 static int rpcb_create_local(void)
 {
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
 		.addrsize	= sizeof(rpcb_inaddr_loopback),
@@ -228,6 +229,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 				    size_t salen, int proto, u32 version)
 {
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= proto,
 		.address	= srvaddr,
 		.addrsize	= salen,
-- 
cgit v1.2.3


From 07263f1efe7d5b96e6713471abfa087f41bb2b7c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 31 May 2010 19:09:40 -0400
Subject: nfsd4: minor variable renaming (cb -> conn)

Now that we have both nfsd4_callback and nfsd4_cb_conn structures, I get
confused if variables of both types are always named cb....

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 16 ++++++++--------
 fs/nfsd/nfs4state.c    | 28 ++++++++++++++--------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1112f451295a..4566b69128a3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -472,7 +472,7 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
@@ -481,11 +481,11 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 	struct rpc_create_args args = {
 		.net		= &init_net,
 		.protocol	= XPRT_TRANSPORT_TCP,
-		.address	= (struct sockaddr *) &cb->cb_addr,
-		.addrsize	= cb->cb_addrlen,
+		.address	= (struct sockaddr *) &conn->cb_addr,
+		.addrsize	= conn->cb_addrlen,
 		.timeout	= &timeparms,
 		.program	= &cb_program,
-		.prognumber	= cb->cb_prog,
+		.prognumber	= conn->cb_prog,
 		.version	= 0,
 		.authflavor	= clp->cl_flavor,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -495,8 +495,8 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
-	if (cb->cb_minorversion) {
-		args.bc_xprt = cb->cb_xprt;
+	if (conn->cb_minorversion) {
+		args.bc_xprt = conn->cb_xprt;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
@@ -563,13 +563,13 @@ void do_probe_callback(struct nfs4_client *clp)
 /*
  * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
  */
-void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
 	int status;
 
 	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp, cb);
+	status = setup_callback_client(clp, conn);
 	if (status) {
 		warn_no_callback_path(clp, status);
 		return;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..d347180ce55a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -207,7 +207,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
-	struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
+	struct nfs4_cb_conn *conn = &stp->st_stateowner->so_client->cl_cb_conn;
 
 	dprintk("NFSD alloc_init_deleg\n");
 	/*
@@ -234,7 +234,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	nfs4_file_get_access(fp, O_RDONLY);
 	dp->dl_flock = NULL;
 	dp->dl_type = type;
-	dp->dl_ident = cb->cb_ident;
+	dp->dl_ident = conn->cb_ident;
 	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
@@ -1098,7 +1098,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+	struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
 	unsigned short expected_family;
 
 	/* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1111,24 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 	else
 		goto out_err;
 
-	cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+	conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
 					    se->se_callback_addr_len,
-					    (struct sockaddr *) &cb->cb_addr,
-					    sizeof(cb->cb_addr));
+					    (struct sockaddr *)&conn->cb_addr,
+					    sizeof(conn->cb_addr));
 
-	if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
+	if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
 		goto out_err;
 
-	if (cb->cb_addr.ss_family == AF_INET6)
-		((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+	if (conn->cb_addr.ss_family == AF_INET6)
+		((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
 
-	cb->cb_minorversion = 0;
-	cb->cb_prog = se->se_callback_prog;
-	cb->cb_ident = se->se_callback_ident;
+	conn->cb_minorversion = 0;
+	conn->cb_prog = se->se_callback_prog;
+	conn->cb_ident = se->se_callback_ident;
 	return;
 out_err:
-	cb->cb_addr.ss_family = AF_UNSPEC;
-	cb->cb_addrlen = 0;
+	conn->cb_addr.ss_family = AF_UNSPEC;
+	conn->cb_addrlen = 0;
 	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
-- 
cgit v1.2.3


From 586f36735e1d38c32bbfbb2716461e7178724b15 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 26 May 2010 17:40:53 -0400
Subject: nfsd4: combine nfs4_rpc_args and nfsd4_cb_sequence

These two structs don't really need to be distinct as far as I can tell.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 30 +++++++++++++++---------------
 fs/nfsd/state.h        | 11 +++--------
 2 files changed, 18 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4566b69128a3..5687fce85641 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -247,7 +247,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfs4_rpc_args *args,
 		   struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
@@ -258,8 +258,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
 	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
 
 	WRITE32(OP_CB_SEQUENCE);
-	WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(args->cbs_clp->cl_cb_seq_nr);
+	WRITEMEM(args->args_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(args->args_clp->cl_cb_seq_nr);
 	WRITE32(0);		/* slotid, always 0 */
 	WRITE32(0);		/* highest slotid always 0 */
 	WRITE32(0);		/* cachethis always 0 */
@@ -285,12 +285,12 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
 	struct nfs4_delegation *args = rpc_args->args_op;
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = args->dl_ident,
-		.minorversion = rpc_args->args_seq.cbs_minorversion,
+		.minorversion = rpc_args->args_minorversion,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_cb_compound_hdr(&xdr, &hdr);
-	encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
+	encode_cb_sequence(&xdr, rpc_args, &hdr);
 	encode_cb_recall(&xdr, args, &hdr);
 	encode_cb_nops(&hdr);
 	return 0;
@@ -338,7 +338,7 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
  * with a single slot.
  */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfs4_rpc_args *res,
 		   struct rpc_rqst *rqstp)
 {
 	struct nfs4_sessionid id;
@@ -346,7 +346,7 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
 	u32 dummy;
 	__be32 *p;
 
-	if (res->cbs_minorversion == 0)
+	if (res->args_minorversion == 0)
 		return 0;
 
 	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -362,13 +362,13 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
 	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
 	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-	if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+	if (memcmp(id.data, res->args_clp->cl_sessionid.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out;
 	}
 	READ32(dummy);
-	if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+	if (dummy != res->args_clp->cl_cb_seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out;
 	}
@@ -392,7 +392,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-		struct nfsd4_cb_sequence *seq)
+		struct nfs4_rpc_args *args)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -402,8 +402,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
 	status = decode_cb_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
-	if (seq) {
-		status = decode_cb_sequence(&xdr, seq, rqstp);
+	if (args) {
+		status = decode_cb_sequence(&xdr, args, rqstp);
 		if (status)
 			goto out;
 	}
@@ -603,8 +603,8 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 	 * We'll need the clp during XDR encoding and decoding,
 	 * and the sequence during decoding to verify the reply
 	 */
-	args->args_seq.cbs_clp = clp;
-	task->tk_msg.rpc_resp = &args->args_seq;
+	args->args_clp = clp;
+	task->tk_msg.rpc_resp = args;
 
 out:
 	dprintk("%s status=%d\n", __func__, status);
@@ -623,7 +623,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
 	int status = 0;
 
-	args->args_seq.cbs_minorversion = minorversion;
+	args->args_minorversion = minorversion;
 	if (minorversion) {
 		status = nfsd41_cb_setup_sequence(clp, task);
 		if (status) {
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..59313f1d8e67 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,15 +64,10 @@ typedef struct {
 	(s)->si_fileid, \
 	(s)->si_generation
 
-struct nfsd4_cb_sequence {
-	/* args/res */
-	u32			cbs_minorversion;
-	struct nfs4_client	*cbs_clp;
-};
-
 struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
+	void			*args_op;
+	struct nfs4_client	*args_clp;
+	u32			args_minorversion;
 };
 
 struct nfsd4_callback {
-- 
cgit v1.2.3


From 1c8556026edac60368ceef446f0febc08014ba78 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 26 May 2010 17:46:00 -0400
Subject: nfsd4: rename nfs4_rpc_args->nfsd4_cb_args

With apologies for the gratuitous rename, the new name seems more
helpful to me.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 14 +++++++-------
 fs/nfsd/state.h        |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5687fce85641..5508e928fd9f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -247,7 +247,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfs4_rpc_args *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *args,
 		   struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
@@ -279,7 +279,7 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
-		struct nfs4_rpc_args *rpc_args)
+		struct nfsd4_cb_args *rpc_args)
 {
 	struct xdr_stream xdr;
 	struct nfs4_delegation *args = rpc_args->args_op;
@@ -338,7 +338,7 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
  * with a single slot.
  */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfs4_rpc_args *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *res,
 		   struct rpc_rqst *rqstp)
 {
 	struct nfs4_sessionid id;
@@ -392,7 +392,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-		struct nfs4_rpc_args *args)
+		struct nfsd4_cb_args *args)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -585,7 +585,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		struct rpc_task *task)
 {
-	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	struct nfsd4_cb_args *args = task->tk_msg.rpc_argp;
 	u32 *ptr = (u32 *)clp->cl_sessionid.data;
 	int status = 0;
 
@@ -619,7 +619,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
 	struct nfs4_client *clp = dp->dl_client;
-	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+	struct nfsd4_cb_args *args = task->tk_msg.rpc_argp;
 	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
 	int status = 0;
 
@@ -756,7 +756,7 @@ static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
 	struct rpc_clnt *clnt = clp->cl_cb_client;
-	struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
+	struct nfsd4_cb_args *args = &dp->dl_recall.cb_args;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_cred = callback_cred
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 59313f1d8e67..f988b90ec213 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,14 +64,14 @@ typedef struct {
 	(s)->si_fileid, \
 	(s)->si_generation
 
-struct nfs4_rpc_args {
+struct nfsd4_cb_args {
 	void			*args_op;
 	struct nfs4_client	*args_clp;
 	u32			args_minorversion;
 };
 
 struct nfsd4_callback {
-	struct nfs4_rpc_args cb_args;
+	struct nfsd4_cb_args cb_args;
 	struct work_struct cb_work;
 };
 
-- 
cgit v1.2.3


From 5878453dbde627a8e1b5a4693087e36cb88d45b1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sun, 16 May 2010 16:47:08 -0400
Subject: nfsd4: generic callback code

Make the recall callback code more generic, so that other callbacks
will be able to use it too.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 70 ++++++++++++++++++++++----------------------------
 fs/nfsd/state.h        |  2 ++
 2 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5508e928fd9f..a037f26252ee 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -585,7 +585,6 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		struct rpc_task *task)
 {
-	struct nfsd4_cb_args *args = task->tk_msg.rpc_argp;
 	u32 *ptr = (u32 *)clp->cl_sessionid.data;
 	int status = 0;
 
@@ -598,14 +597,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		status = -EAGAIN;
 		goto out;
 	}
-
-	/*
-	 * We'll need the clp during XDR encoding and decoding,
-	 * and the sequence during decoding to verify the reply
-	 */
-	args->args_clp = clp;
-	task->tk_msg.rpc_resp = args;
-
 out:
 	dprintk("%s status=%d\n", __func__, status);
 	return status;
@@ -617,7 +608,8 @@ out:
  */
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 	struct nfsd4_cb_args *args = task->tk_msg.rpc_argp;
 	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
@@ -640,7 +632,8 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 
 	dprintk("%s: minorversion=%d\n", __func__,
@@ -662,7 +655,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
@@ -707,7 +701,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 
 	nfs4_put_delegation(dp);
 }
@@ -749,42 +744,39 @@ void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
 		rpc_shutdown_client(old);
 }
 
-/*
- * called with dp->dl_count inc'ed.
- */
-static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
+void nfsd4_release_cb(struct nfsd4_callback *cb)
 {
-	struct nfs4_client *clp = dp->dl_client;
+	if (cb->cb_ops->rpc_release)
+		cb->cb_ops->rpc_release(cb);
+}
+
+void nfsd4_do_callback_rpc(struct work_struct *w)
+{
+	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
+	struct nfs4_client *clp = cb->cb_args.args_clp;
 	struct rpc_clnt *clnt = clp->cl_cb_client;
-	struct nfsd4_cb_args *args = &dp->dl_recall.cb_args;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-		.rpc_cred = callback_cred
-	};
 
 	if (clnt == NULL) {
-		nfs4_put_delegation(dp);
+		nfsd4_release_cb(cb);
 		return; /* Client is shutting down; give up. */
 	}
-
-	args->args_op = dp;
-	msg.rpc_argp = args;
-	dp->dl_retries = 1;
-	rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
+	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT, cb->cb_ops, cb);
 }
 
-void nfsd4_do_callback_rpc(struct work_struct *w)
+void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
-	/* XXX: for now, just send off delegation recall. */
-	/* In future, generalize to handle any sort of callback. */
-	struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
-	struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
-
-	_nfsd4_cb_recall(dp);
-}
+	struct nfsd4_callback *cb = &dp->dl_recall;
 
+	dp->dl_retries = 1;
+	cb->cb_args.args_op = dp;
+	cb->cb_args.args_clp = dp->dl_client;
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+	cb->cb_msg.rpc_argp = &cb->cb_args;
+	cb->cb_msg.rpc_resp = &cb->cb_args;
+	cb->cb_msg.rpc_cred = callback_cred;
+
+	cb->cb_ops = &nfsd4_cb_recall_ops;
+	dp->dl_retries = 1;
 
-void nfsd4_cb_recall(struct nfs4_delegation *dp)
-{
 	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index f988b90ec213..6e592148ad80 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -72,6 +72,8 @@ struct nfsd4_cb_args {
 
 struct nfsd4_callback {
 	struct nfsd4_cb_args cb_args;
+	struct rpc_message cb_msg;
+	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
 };
 
-- 
cgit v1.2.3


From cee277d92495a9ea49a6137fe7005d7c76b31b5b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 26 May 2010 17:52:14 -0400
Subject: nfsd4: use generic callback code in null case

This will eventually allow us, for example, to kick off null callback
from contexts where we can't sleep.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 33 ++++++++++++++++++---------------
 fs/nfsd/nfs4state.c    |  1 +
 fs/nfsd/state.h        |  1 +
 3 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a037f26252ee..26fa878005cc 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -519,7 +519,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_client *clp = calldata;
+	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
 
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
@@ -528,6 +528,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+	/* XXX: release method to ensure we set the cb channel down if
+	 * necessary on early failure? */
 	.rpc_call_done = nfsd4_cb_probe_done,
 };
 
@@ -543,21 +545,23 @@ int set_callback_cred(void)
 	return 0;
 }
 
+static struct workqueue_struct *callback_wq;
 
 void do_probe_callback(struct nfs4_client *clp)
 {
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-		.rpc_cred	= callback_cred
-	};
-	int status;
+	struct nfsd4_callback *cb = &clp->cl_cb_null;
 
-	status = rpc_call_async(clp->cl_cb_client, &msg,
-				RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
-				&nfsd4_cb_probe_ops, (void *)clp);
-	if (status)
-		warn_no_callback_path(clp, status);
+	cb->cb_args.args_op = NULL;
+	cb->cb_args.args_clp = clp;
+
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
+	cb->cb_msg.rpc_argp = NULL;
+	cb->cb_msg.rpc_resp = NULL;
+	cb->cb_msg.rpc_cred = callback_cred;
+
+	cb->cb_ops = &nfsd4_cb_probe_ops;
+
+	queue_work(callback_wq, &cb->cb_work);
 }
 
 /*
@@ -713,8 +717,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
 	.rpc_release = nfsd4_cb_recall_release,
 };
 
-static struct workqueue_struct *callback_wq;
-
 int nfsd4_create_callback_queue(void)
 {
 	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -760,7 +762,8 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
 		nfsd4_release_cb(cb);
 		return; /* Client is shutting down; give up. */
 	}
-	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT, cb->cb_ops, cb);
+	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+			cb->cb_ops, cb);
 }
 
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d347180ce55a..2f464fb26afc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -978,6 +978,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6e592148ad80..19732d531cda 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -223,6 +223,7 @@ struct nfs4_client {
 	struct nfs4_cb_conn	cl_cb_conn;
 	struct rpc_clnt		*cl_cb_client;
 	atomic_t		cl_cb_set;
+	struct nfsd4_callback	cl_cb_null;
 
 	/* for nfs41 */
 	struct list_head	cl_sessions;
-- 
cgit v1.2.3


From fb003923263c3f0cb02adbd56a22fe16ef5c0e77 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 31 May 2010 18:21:37 -0400
Subject: nfsd4: remove separate cb_args struct

I don't see the point of the separate struct.  It seems to just be
getting in the way.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 45 ++++++++++++++++++++++-----------------------
 fs/nfsd/state.h        | 10 +++-------
 2 files changed, 25 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 26fa878005cc..07c3be6eea64 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -247,7 +247,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
@@ -258,8 +258,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *args,
 	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
 
 	WRITE32(OP_CB_SEQUENCE);
-	WRITEMEM(args->args_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(args->args_clp->cl_cb_seq_nr);
+	WRITEMEM(cb->cb_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(cb->cb_clp->cl_cb_seq_nr);
 	WRITE32(0);		/* slotid, always 0 */
 	WRITE32(0);		/* highest slotid always 0 */
 	WRITE32(0);		/* cachethis always 0 */
@@ -279,18 +279,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
-		struct nfsd4_cb_args *rpc_args)
+		struct nfsd4_callback *cb)
 {
 	struct xdr_stream xdr;
-	struct nfs4_delegation *args = rpc_args->args_op;
+	struct nfs4_delegation *args = cb->cb_op;
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = args->dl_ident,
-		.minorversion = rpc_args->args_minorversion,
+		.minorversion = cb->cb_minorversion,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_cb_compound_hdr(&xdr, &hdr);
-	encode_cb_sequence(&xdr, rpc_args, &hdr);
+	encode_cb_sequence(&xdr, cb, &hdr);
 	encode_cb_recall(&xdr, args, &hdr);
 	encode_cb_nops(&hdr);
 	return 0;
@@ -338,7 +338,7 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
  * with a single slot.
  */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct rpc_rqst *rqstp)
 {
 	struct nfs4_sessionid id;
@@ -346,7 +346,7 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *res,
 	u32 dummy;
 	__be32 *p;
 
-	if (res->args_minorversion == 0)
+	if (cb->cb_minorversion == 0)
 		return 0;
 
 	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -362,13 +362,13 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_args *res,
 	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
 	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-	if (memcmp(id.data, res->args_clp->cl_sessionid.data,
+	if (memcmp(id.data, cb->cb_clp->cl_sessionid.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out;
 	}
 	READ32(dummy);
-	if (dummy != res->args_clp->cl_cb_seq_nr) {
+	if (dummy != cb->cb_clp->cl_cb_seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out;
 	}
@@ -392,7 +392,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-		struct nfsd4_cb_args *args)
+		struct nfsd4_callback *cb)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -402,8 +402,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
 	status = decode_cb_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
-	if (args) {
-		status = decode_cb_sequence(&xdr, args, rqstp);
+	if (cb) {
+		status = decode_cb_sequence(&xdr, cb, rqstp);
 		if (status)
 			goto out;
 	}
@@ -551,8 +551,8 @@ void do_probe_callback(struct nfs4_client *clp)
 {
 	struct nfsd4_callback *cb = &clp->cl_cb_null;
 
-	cb->cb_args.args_op = NULL;
-	cb->cb_args.args_clp = clp;
+	cb->cb_op = NULL;
+	cb->cb_clp = clp;
 
 	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
 	cb->cb_msg.rpc_argp = NULL;
@@ -615,11 +615,10 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
-	struct nfsd4_cb_args *args = task->tk_msg.rpc_argp;
 	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
 	int status = 0;
 
-	args->args_minorversion = minorversion;
+	cb->cb_minorversion = minorversion;
 	if (minorversion) {
 		status = nfsd41_cb_setup_sequence(clp, task);
 		if (status) {
@@ -755,7 +754,7 @@ void nfsd4_release_cb(struct nfsd4_callback *cb)
 void nfsd4_do_callback_rpc(struct work_struct *w)
 {
 	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
-	struct nfs4_client *clp = cb->cb_args.args_clp;
+	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt = clp->cl_cb_client;
 
 	if (clnt == NULL) {
@@ -771,11 +770,11 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
 	struct nfsd4_callback *cb = &dp->dl_recall;
 
 	dp->dl_retries = 1;
-	cb->cb_args.args_op = dp;
-	cb->cb_args.args_clp = dp->dl_client;
+	cb->cb_op = dp;
+	cb->cb_clp = dp->dl_client;
 	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
-	cb->cb_msg.rpc_argp = &cb->cb_args;
-	cb->cb_msg.rpc_resp = &cb->cb_args;
+	cb->cb_msg.rpc_argp = cb;
+	cb->cb_msg.rpc_resp = cb;
 	cb->cb_msg.rpc_cred = callback_cred;
 
 	cb->cb_ops = &nfsd4_cb_recall_ops;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 19732d531cda..2ece6bee65f7 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,14 +64,10 @@ typedef struct {
 	(s)->si_fileid, \
 	(s)->si_generation
 
-struct nfsd4_cb_args {
-	void			*args_op;
-	struct nfs4_client	*args_clp;
-	u32			args_minorversion;
-};
-
 struct nfsd4_callback {
-	struct nfsd4_cb_args cb_args;
+	void *cb_op;
+	struct nfs4_client *cb_clp;
+	u32 cb_minorversion;
 	struct rpc_message cb_msg;
 	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
-- 
cgit v1.2.3


From 6ff8da088766d70f0441feb982b82978a6cbf7ef Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 4 Jun 2010 20:04:45 -0400
Subject: nfsd4: Move callback setup to callback queue

Instead of creating the new rpc client from a regular server thread,
set a flag, kick off a null call, and allow the null call to do the work
of setting up the client on the callback workqueue.

Use a spinlock to ensure the callback work gets a consistent view of the
callback parameters.

This allows, for example, changing the callback from contexts where
sleeping is not allowed.  I hope it will also keep the locking simple as
we add more session and trunking features, by serializing most of the
callback-specific work.

This also closes a small race where the the new cb_ident could be used
with an old connection (or vice-versa).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 73 +++++++++++++++++++++++++++++++++++---------------
 fs/nfsd/nfs4state.c    |  7 +++--
 fs/nfsd/state.h        | 10 +++++--
 3 files changed, 63 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 07c3be6eea64..a269dbeff150 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -284,7 +284,7 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
 	struct xdr_stream xdr;
 	struct nfs4_delegation *args = cb->cb_op;
 	struct nfs4_cb_compound_hdr hdr = {
-		.ident = args->dl_ident,
+		.ident = cb->cb_clp->cl_cb_ident,
 		.minorversion = cb->cb_minorversion,
 	};
 
@@ -506,7 +506,8 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	nfsd4_set_callback_client(clp, client);
+	clp->cl_cb_ident = conn->cb_ident;
+	clp->cl_cb_client = client;
 	return 0;
 
 }
@@ -569,15 +570,12 @@ void do_probe_callback(struct nfs4_client *clp)
  */
 void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
-	int status;
-
 	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp, conn);
-	if (status) {
-		warn_no_callback_path(clp, status);
-		return;
-	}
+	spin_lock(&clp->cl_lock);
+	memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
+	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+	spin_unlock(&clp->cl_lock);
 	do_probe_callback(clp);
 }
 
@@ -730,19 +728,16 @@ void nfsd4_destroy_callback_queue(void)
 }
 
 /* must be called under the state lock */
-void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
 {
-	struct rpc_clnt *old = clp->cl_cb_client;
-
-	clp->cl_cb_client = new;
+	set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
 	/*
-	 * After this, any work that saw the old value of cl_cb_client will
-	 * be gone:
+	 * Note this won't actually result in a null callback;
+	 * instead, nfsd4_do_callback_rpc() will detect the killed
+	 * client, destroy the rpc client, and stop:
 	 */
+	do_probe_callback(clp);
 	flush_workqueue(callback_wq);
-	/* So we can safely shut it down: */
-	if (old)
-		rpc_shutdown_client(old);
 }
 
 void nfsd4_release_cb(struct nfsd4_callback *cb)
@@ -751,15 +746,51 @@ void nfsd4_release_cb(struct nfsd4_callback *cb)
 		cb->cb_ops->rpc_release(cb);
 }
 
+void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_conn conn;
+	struct nfs4_client *clp = cb->cb_clp;
+	int err;
+
+	/*
+	 * This is either an update, or the client dying; in either case,
+	 * kill the old client:
+	 */
+	if (clp->cl_cb_client) {
+		rpc_shutdown_client(clp->cl_cb_client);
+		clp->cl_cb_client = NULL;
+	}
+	if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
+		return;
+	spin_lock(&clp->cl_lock);
+	/*
+	 * Only serialized callback code is allowed to clear these
+	 * flags; main nfsd code can only set them:
+	 */
+	BUG_ON(!clp->cl_cb_flags);
+	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+	spin_unlock(&clp->cl_lock);
+
+	err = setup_callback_client(clp, &conn);
+	if (err)
+		warn_no_callback_path(clp, err);
+}
+
 void nfsd4_do_callback_rpc(struct work_struct *w)
 {
 	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
 	struct nfs4_client *clp = cb->cb_clp;
-	struct rpc_clnt *clnt = clp->cl_cb_client;
+	struct rpc_clnt *clnt;
 
-	if (clnt == NULL) {
+	if (clp->cl_cb_flags)
+		nfsd4_process_cb_update(cb);
+
+	clnt = clp->cl_cb_client;
+	if (!clnt) {
+		/* Callback channel broken, or client killed; give up: */
 		nfsd4_release_cb(cb);
-		return; /* Client is shutting down; give up. */
+		return;
 	}
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
 			cb->cb_ops, cb);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2f464fb26afc..d3f12dcc1696 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
-	struct nfs4_cb_conn *conn = &stp->st_stateowner->so_client->cl_cb_conn;
 
 	dprintk("NFSD alloc_init_deleg\n");
 	/*
@@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	nfs4_file_get_access(fp, O_RDONLY);
 	dp->dl_flock = NULL;
 	dp->dl_type = type;
-	dp->dl_ident = conn->cb_ident;
 	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
@@ -875,7 +873,7 @@ expire_client(struct nfs4_client *clp)
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	nfsd4_set_callback_client(clp, NULL);
+	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	list_del(&clp->cl_idhash);
@@ -978,6 +976,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	spin_lock_init(&clp->cl_lock);
 	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
@@ -1547,7 +1546,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
 
 	nfs4_lock_state();
 	/* wait for callbacks */
-	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfsd4_shutdown_callback(ses->se_client);
 	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 2ece6bee65f7..58bc2a63ca14 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -84,7 +84,6 @@ struct nfs4_delegation {
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
-	u32			dl_ident;
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
@@ -217,10 +216,17 @@ struct nfs4_client {
 
 	/* for v4.0 and v4.1 callbacks: */
 	struct nfs4_cb_conn	cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE	1
+#define NFSD4_CLIENT_KILL	2
+	unsigned long		cl_cb_flags;
 	struct rpc_clnt		*cl_cb_client;
+	u32			cl_cb_ident;
 	atomic_t		cl_cb_set;
 	struct nfsd4_callback	cl_cb_null;
 
+	/* for all client information that callback code might need: */
+	spinlock_t		cl_lock;
+
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
@@ -439,7 +445,7 @@ extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
-extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
-- 
cgit v1.2.3


From c23753dac1d21b39facd2ad3c7340dd275b3022f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 27 Sep 2010 16:22:30 -0400
Subject: nfsd4: fix alloc_init_session BUILD_BUG_ON()

Note we're allocating an array of nfsd4_slot *'s, not nfsd4_slot's.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d3f12dcc1696..e30579432863 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -657,7 +657,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	if (status)
 		goto out;
 
-	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
 		     + sizeof(struct nfsd4_session) > PAGE_SIZE);
 
 	status = nfserr_jukebox;
-- 
cgit v1.2.3


From dd93842457174b847b023314e5a501e5ed45caeb Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 27 Sep 2010 16:26:25 -0400
Subject: nfsd4: fix alloc_init_session return type

This returns an nfs error, not -ERRNO.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e30579432863..ebddcc173ed8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -639,9 +639,7 @@ static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
 	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
 }
 
-static int
-alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
-		   struct nfsd4_create_session *cses)
+static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	struct nfsd4_session *new, tmp;
 	struct nfsd4_slot *sp;
-- 
cgit v1.2.3


From 5b6feee9608dce7afd2646f457c93e612526d1d8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 27 Sep 2010 17:12:05 -0400
Subject: nfsd4: clean up session allocation

Changes:
	- make sure session memory reservation is released on failure
	  path.
	- use min_t()/min() for more compact code in several places.
	- break alloc_init_session into smaller pieces.
	- miscellaneous other cleanup.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 211 ++++++++++++++++++++++------------------------------
 1 file changed, 89 insertions(+), 122 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ebddcc173ed8..f86476c23b2f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -533,94 +533,6 @@ gen_sessionid(struct nfsd4_session *ses)
  */
 #define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
 
-/*
- * Give the client the number of ca_maxresponsesize_cached slots it
- * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
- * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
- * than NFSD_MAX_SLOTS_PER_SESSION.
- *
- * If we run out of reserved DRC memory we should (up to a point)
- * re-negotiate active sessions and reduce their slot usage to make
- * rooom for new connections. For now we just fail the create session.
- */
-static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
-{
-	int mem, size = fchan->maxresp_cached;
-
-	if (fchan->maxreqs < 1)
-		return nfserr_inval;
-
-	if (size < NFSD_MIN_HDR_SEQ_SZ)
-		size = NFSD_MIN_HDR_SEQ_SZ;
-	size -= NFSD_MIN_HDR_SEQ_SZ;
-	if (size > NFSD_SLOT_CACHE_SIZE)
-		size = NFSD_SLOT_CACHE_SIZE;
-
-	/* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
-	mem = fchan->maxreqs * size;
-	if (mem > NFSD_MAX_MEM_PER_SESSION) {
-		fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
-		if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
-			fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-		mem = fchan->maxreqs * size;
-	}
-
-	spin_lock(&nfsd_drc_lock);
-	/* bound the total session drc memory ussage */
-	if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
-		fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
-		mem = fchan->maxreqs * size;
-	}
-	nfsd_drc_mem_used += mem;
-	spin_unlock(&nfsd_drc_lock);
-
-	if (fchan->maxreqs == 0)
-		return nfserr_jukebox;
-
-	fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
-	return 0;
-}
-
-/*
- * fchan holds the client values on input, and the server values on output
- * sv_max_mesg is the maximum payload plus one page for overhead.
- */
-static int init_forechannel_attrs(struct svc_rqst *rqstp,
-				  struct nfsd4_channel_attrs *session_fchan,
-				  struct nfsd4_channel_attrs *fchan)
-{
-	int status = 0;
-	__u32   maxcount = nfsd_serv->sv_max_mesg;
-
-	/* headerpadsz set to zero in encode routine */
-
-	/* Use the client's max request and max response size if possible */
-	if (fchan->maxreq_sz > maxcount)
-		fchan->maxreq_sz = maxcount;
-	session_fchan->maxreq_sz = fchan->maxreq_sz;
-
-	if (fchan->maxresp_sz > maxcount)
-		fchan->maxresp_sz = maxcount;
-	session_fchan->maxresp_sz = fchan->maxresp_sz;
-
-	/* Use the client's maxops if possible */
-	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
-		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
-	session_fchan->maxops = fchan->maxops;
-
-	/* FIXME: Error means no more DRC pages so the server should
-	 * recover pages from existing sessions. For now fail session
-	 * creation.
-	 */
-	status = set_forechannel_drc_size(fchan);
-
-	session_fchan->maxresp_cached = fchan->maxresp_cached;
-	session_fchan->maxreqs = fchan->maxreqs;
-
-	dprintk("%s status %d\n", __func__, status);
-	return status;
-}
-
 static void
 free_session_slots(struct nfsd4_session *ses)
 {
@@ -639,63 +551,118 @@ static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
 	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
 }
 
-static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static int nfsd4_sanitize_slot_size(u32 size)
 {
-	struct nfsd4_session *new, tmp;
-	struct nfsd4_slot *sp;
-	int idx, slotsize, cachesize, i;
-	int status;
+	size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
+	size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
 
-	memset(&tmp, 0, sizeof(tmp));
+	return size;
+}
 
-	/* FIXME: For now, we just accept the client back channel attributes. */
-	tmp.se_bchannel = cses->back_channel;
-	status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
-					&cses->fore_channel);
-	if (status)
-		goto out;
+/*
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * rooom for new connections. For now we just fail the create session.
+ */
+static int nfsd4_get_drc_mem(int slotsize, u32 num)
+{
+	int avail;
 
-	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
-		     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
 
-	status = nfserr_jukebox;
-	/* allocate struct nfsd4_session and slot table pointers in one piece */
-	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
-	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
-	if (!new)
-		goto out;
+	spin_lock(&nfsd_drc_lock);
+	avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
+			nfsd_drc_max_mem - nfsd_drc_mem_used);
+	num = min_t(int, num, avail / slotsize);
+	nfsd_drc_mem_used += num * slotsize;
+	spin_unlock(&nfsd_drc_lock);
+
+	return num;
+}
+
+static void nfsd4_put_drc_mem(int slotsize, int num)
+{
+	spin_lock(&nfsd_drc_lock);
+	nfsd_drc_mem_used -= slotsize * num;
+	spin_unlock(&nfsd_drc_lock);
+}
+
+static struct nfsd4_session *alloc_session(int slotsize, int numslots)
+{
+	struct nfsd4_session *new;
+	int mem, i;
 
-	memcpy(new, &tmp, sizeof(*new));
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
+			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
+	mem = numslots * sizeof(struct nfsd4_slot *);
 
+	new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+	if (!new)
+		return NULL;
 	/* allocate each struct nfsd4_slot and data cache in one piece */
-	cachesize = slot_bytes(&new->se_fchannel);
-	for (i = 0; i < new->se_fchannel.maxreqs; i++) {
-		sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
-		if (!sp)
+	for (i = 0; i < numslots; i++) {
+		mem = sizeof(struct nfsd4_slot) + slotsize;
+		new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+		if (!new->se_slots[i])
 			goto out_free;
-		new->se_slots[i] = sp;
 	}
+	return new;
+out_free:
+	while (i--)
+		kfree(new->se_slots[i]);
+	kfree(new);
+	return NULL;
+}
+
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
+{
+	u32 maxrpc = nfsd_serv->sv_max_mesg;
+
+	new->maxreqs = numslots;
+	new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
+	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
+	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+}
+
+static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new;
+	struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
+	int numslots, slotsize;
+	int idx;
+
+	/*
+	 * Note decreasing slot size below client's request may
+	 * make it difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.
+	 */
+	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
+	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+
+	new = alloc_session(slotsize, numslots);
+	if (!new) {
+		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
+		return nfserr_jukebox;
+	}
+	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
 
 	new->se_client = clp;
 	gen_sessionid(new);
-	idx = hash_sessionid(&new->se_sessionid);
 	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
+	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&client_lock);
 
-	status = nfs_ok;
-out:
-	return status;
-out_free:
-	free_session_slots(new);
-	kfree(new);
-	goto out;
+	return nfs_ok;
 }
 
 /* caller must hold client_lock */
-- 
cgit v1.2.3


From c7662518c781edc8059cd9737d18168154bf7510 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sun, 6 Jun 2010 18:12:14 -0400
Subject: nfsd4: keep per-session list of connections

The spec requires us in various places to keep track of the connections
associated with each session.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c  | 69 ++++++++++++++++++++++++++++++++++++++++------------
 fs/nfsd/state.h      |  8 ++++++
 include/linux/nfs4.h |  3 +++
 3 files changed, 65 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f86476c23b2f..c7c1a7afa197 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -625,11 +625,58 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
 	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
 }
 
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_conn *conn;
+
+	conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
+	if (!conn)
+		return nfserr_jukebox;
+	conn->cn_flags = NFS4_CDFC4_FORE;
+	svc_xprt_get(rqstp->rq_xprt);
+	conn->cn_xprt = rqstp->rq_xprt;
+
+	spin_lock(&clp->cl_lock);
+	list_add(&conn->cn_persession, &ses->se_conns);
+	spin_unlock(&clp->cl_lock);
+
+	return nfs_ok;
+}
+
+static void free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
+
+void free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int mem;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	while (!list_empty(&ses->se_conns)) {
+		struct nfsd4_conn *c;
+		c = list_first_entry(&ses->se_conns, struct nfsd4_conn, cn_persession);
+		list_del(&c->cn_persession);
+		free_conn(c);
+	}
+	spin_lock(&nfsd_drc_lock);
+	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+	nfsd_drc_mem_used -= mem;
+	spin_unlock(&nfsd_drc_lock);
+	free_session_slots(ses);
+	kfree(ses);
+}
+
+
 static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	struct nfsd4_session *new;
 	struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
 	int numslots, slotsize;
+	int status;
 	int idx;
 
 	/*
@@ -654,6 +701,8 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
 
+	INIT_LIST_HEAD(&new->se_conns);
+
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
 	idx = hash_sessionid(&new->se_sessionid);
@@ -662,6 +711,11 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&client_lock);
 
+	status = nfsd4_new_conn(rqstp, new);
+	if (status) {
+		free_session(&new->se_ref);
+		return nfserr_jukebox;
+	}
 	return nfs_ok;
 }
 
@@ -694,21 +748,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-void
-free_session(struct kref *kref)
-{
-	struct nfsd4_session *ses;
-	int mem;
-
-	ses = container_of(kref, struct nfsd4_session, se_ref);
-	spin_lock(&nfsd_drc_lock);
-	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
-	nfsd_drc_mem_used -= mem;
-	spin_unlock(&nfsd_drc_lock);
-	free_session_slots(ses);
-	kfree(ses);
-}
-
 /* must be called under the client_lock */
 static inline void
 renew_client_locked(struct nfs4_client *clp)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 58bc2a63ca14..29413c2ed270 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -152,6 +152,13 @@ struct nfsd4_clid_slot {
 	struct nfsd4_create_session	sl_cr_ses;
 };
 
+struct nfsd4_conn {
+	struct list_head cn_persession;
+	struct svc_xprt *cn_xprt;
+/* CDFC4_FORE, CDFC4_BACK: */
+	unsigned char cn_flags;
+};
+
 struct nfsd4_session {
 	struct kref		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
@@ -161,6 +168,7 @@ struct nfsd4_session {
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
+	struct list_head	se_conns;
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 07e40c625972..79b15fb2f304 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -61,6 +61,9 @@
 #define NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL	0x10000
 #define NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED		0x20000
 
+#define NFS4_CDFC4_FORE	0x1
+#define NFS4_CDFC4_BACK 0x2
+
 #define NFS4_SET_TO_SERVER_TIME	0
 #define NFS4_SET_TO_CLIENT_TIME	1
 
-- 
cgit v1.2.3


From 19cf5c026f3ee06027523e59478e3fa54f573e5e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sun, 6 Jun 2010 18:37:16 -0400
Subject: nfsd4: use callbacks on svc_xprt_deletion

Remove connections from the list when they go down.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 fs/nfsd/state.h     |  3 +++
 2 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c7c1a7afa197..b7e9793b58f5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -625,6 +625,25 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
 	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
 }
 
+static void free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
+
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
+{
+	struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+	struct nfs4_client *clp = c->cn_session->se_client;
+
+	spin_lock(&clp->cl_lock);
+	if (!list_empty(&c->cn_persession)) {
+		list_del(&c->cn_persession);
+		free_conn(c);
+	}
+	spin_unlock(&clp->cl_lock);
+}
+
 static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 {
 	struct nfs4_client *clp = ses->se_client;
@@ -636,18 +655,34 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 	conn->cn_flags = NFS4_CDFC4_FORE;
 	svc_xprt_get(rqstp->rq_xprt);
 	conn->cn_xprt = rqstp->rq_xprt;
+	conn->cn_session = ses;
 
 	spin_lock(&clp->cl_lock);
 	list_add(&conn->cn_persession, &ses->se_conns);
 	spin_unlock(&clp->cl_lock);
 
+	conn->cn_xpt_user.callback = nfsd4_conn_lost;
+	register_xpt_user(rqstp->rq_xprt, &conn->cn_xpt_user);
 	return nfs_ok;
 }
 
-static void free_conn(struct nfsd4_conn *c)
+static void nfsd4_del_conns(struct nfsd4_session *s)
 {
-	svc_xprt_put(c->cn_xprt);
-	kfree(c);
+	struct nfs4_client *clp = s->se_client;
+	struct nfsd4_conn *c;
+
+	spin_lock(&clp->cl_lock);
+	while (!list_empty(&s->se_conns)) {
+		c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+		list_del_init(&c->cn_persession);
+		spin_unlock(&clp->cl_lock);
+
+		unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
+		free_conn(c);
+
+		spin_lock(&clp->cl_lock);
+	}
+	spin_unlock(&clp->cl_lock);
 }
 
 void free_session(struct kref *kref)
@@ -656,12 +691,7 @@ void free_session(struct kref *kref)
 	int mem;
 
 	ses = container_of(kref, struct nfsd4_session, se_ref);
-	while (!list_empty(&ses->se_conns)) {
-		struct nfsd4_conn *c;
-		c = list_first_entry(&ses->se_conns, struct nfsd4_conn, cn_persession);
-		list_del(&c->cn_persession);
-		free_conn(c);
-	}
+	nfsd4_del_conns(ses);
 	spin_lock(&nfsd_drc_lock);
 	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
 	nfsd_drc_mem_used -= mem;
@@ -1552,6 +1582,9 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	/* wait for callbacks */
 	nfsd4_shutdown_callback(ses->se_client);
 	nfs4_unlock_state();
+
+	nfsd4_del_conns(ses);
+
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 29413c2ed270..8d5e2370cce0 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
 
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
 
@@ -155,6 +156,8 @@ struct nfsd4_clid_slot {
 struct nfsd4_conn {
 	struct list_head cn_persession;
 	struct svc_xprt *cn_xprt;
+	struct svc_xpt_user cn_xpt_user;
+	struct nfsd4_session *cn_session;
 /* CDFC4_FORE, CDFC4_BACK: */
 	unsigned char cn_flags;
 };
-- 
cgit v1.2.3


From db90681d6eff89efc1eee523e1cb77eb632a6cf7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 29 Sep 2010 15:29:32 -0400
Subject: nfsd4: refactor connection allocation

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b7e9793b58f5..3b4d74cbb6c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -644,25 +644,45 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 	spin_unlock(&clp->cl_lock);
 }
 
-static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp)
 {
-	struct nfs4_client *clp = ses->se_client;
 	struct nfsd4_conn *conn;
 
 	conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
 	if (!conn)
-		return nfserr_jukebox;
-	conn->cn_flags = NFS4_CDFC4_FORE;
+		return NULL;
 	svc_xprt_get(rqstp->rq_xprt);
 	conn->cn_xprt = rqstp->rq_xprt;
-	conn->cn_session = ses;
+	conn->cn_flags = NFS4_CDFC4_FORE;
+	INIT_LIST_HEAD(&conn->cn_xpt_user.list);
+	return conn;
+}
+
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
 
 	spin_lock(&clp->cl_lock);
+	conn->cn_session = ses;
 	list_add(&conn->cn_persession, &ses->se_conns);
 	spin_unlock(&clp->cl_lock);
+}
 
+static void nfsd4_register_conn(struct nfsd4_conn *conn)
+{
 	conn->cn_xpt_user.callback = nfsd4_conn_lost;
-	register_xpt_user(rqstp->rq_xprt, &conn->cn_xpt_user);
+	register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+}
+
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+	struct nfsd4_conn *conn;
+
+	conn = alloc_conn(rqstp);
+	if (!conn)
+		return nfserr_jukebox;
+	nfsd4_hash_conn(conn, ses);
+	nfsd4_register_conn(conn);
 	return nfs_ok;
 }
 
-- 
cgit v1.2.3


From 328ead287220711c3ad4490b1f3f691855df4039 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 29 Sep 2010 16:11:06 -0400
Subject: nfsd4: add new connections to session

As long as we're not implementing any session security, we should just
automatically add any new connections that come along to the list of
sessions associated with the session.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b4d74cbb6c8..596702e157c9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -658,13 +658,18 @@ static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp)
 	return conn;
 }
 
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	conn->cn_session = ses;
+	list_add(&conn->cn_persession, &ses->se_conns);
+}
+
 static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
 {
 	struct nfs4_client *clp = ses->se_client;
 
 	spin_lock(&clp->cl_lock);
-	conn->cn_session = ses;
-	list_add(&conn->cn_persession, &ses->se_conns);
+	__nfsd4_hash_conn(conn, ses);
 	spin_unlock(&clp->cl_lock);
 }
 
@@ -1612,6 +1617,44 @@ out:
 	return status;
 }
 
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_rqst *r, struct nfsd4_session *s)
+{
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(c, &s->se_conns, cn_persession) {
+		if (c->cn_xprt == r->rq_xprt) {
+			return c;
+		}
+	}
+	return NULL;
+}
+
+static void nfsd4_sequence_check_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_conn *c, *new = NULL;
+
+	spin_lock(&clp->cl_lock);
+	c = __nfsd4_find_conn(rqstp, ses);
+	spin_unlock(&clp->cl_lock);
+	if (c)
+		return;
+
+	new = alloc_conn(rqstp);
+
+	spin_lock(&clp->cl_lock);
+	c = __nfsd4_find_conn(rqstp, ses);
+	if (c) {
+		spin_unlock(&clp->cl_lock);
+		free_conn(new);
+		return;
+	}
+	__nfsd4_hash_conn(new, ses);
+	spin_unlock(&clp->cl_lock);
+	nfsd4_register_conn(new);
+	return;
+}
+
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
 	       struct nfsd4_compound_state *cstate,
@@ -1656,6 +1699,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status)
 		goto out;
 
+	nfsd4_sequence_check_conn(rqstp, session);
+
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
-- 
cgit v1.2.3


From 33515142156efc9ab5dbfe93ff8d4765559dc987 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 2 Oct 2010 18:42:39 -0400
Subject: nfsd4: return expired on unfound stateid's

Commit 78155ed75f470710f2aecb3e75e3d97107ba8374 "nfsd4: distinguish
expired from stale stateids" attempted to distinguish expired and stale
stateid's using time information that may not have been completely
reliable, so I reverted it.

That was throwing out the baby with the bathwater; we still do want to
return expired, but let's do that using the simpler approach of just
assuming any stateid is expired if it looks like it was given out by the
current server instance, but we can't find it any more.

This may help clients that are recovering from network partitions.

Reported-by: Bian Naimeng <biannm@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 596702e157c9..02c23b7c5cd5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3046,7 +3046,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	if (STALE_STATEID(stateid)) 
 		goto out;
 
-	status = nfserr_bad_stateid;
+	/*
+	 * We assume that any stateid that has the current boot time,
+	 * but that we can't find, is expired:
+	 */
+	status = nfserr_expired;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
 		if (!dp)
@@ -3066,6 +3070,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		stp = find_stateid(stateid, flags);
 		if (!stp)
 			goto out;
+		status = nfserr_bad_stateid;
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -3140,8 +3145,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 * a replayed close:
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
+		/* It's not stale; let's assume it's expired: */
 		if (sop == NULL)
-			return nfserr_bad_stateid;
+			return nfserr_expired;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3406,6 +3412,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (!is_delegation_stateid(stateid))
 		goto out;
+	status = nfserr_expired;
 	dp = find_delegation_stateid(inode, stateid);
 	if (!dp)
 		goto out;
-- 
cgit v1.2.3


From db71922217a214e5c9268448e537b54fc1f301ea Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Sun, 15 Aug 2010 22:51:10 +0200
Subject: BKL: Explicitly add BKL around get_sb/fill_super

This patch is a preparation necessary to remove the BKL from do_new_mount().
It explicitly adds calls to lock_kernel()/unlock_kernel() around
get_sb/fill_super operations for filesystems that still uses the BKL.

I've read through all the code formerly covered by the BKL inside
do_kern_mount() and have satisfied myself that it doesn't need the BKL
any more.

do_kern_mount() is already called without the BKL when mounting the rootfs
and in nfsctl. do_kern_mount() calls vfs_kern_mount(), which is called
from various places without BKL: simple_pin_fs(), nfs_do_clone_mount()
through nfs_follow_mountpoint(), afs_mntpt_do_automount() through
afs_mntpt_follow_link(). Both later functions are actually the filesystems
follow_link inode operation. vfs_kern_mount() is calling the specified
get_sb function and lets the filesystem do its job by calling the given
fill_super function.

Therefore I think it is safe to push down the BKL from the VFS to the
low-level filesystems get_sb/fill_super operation.

[arnd: do not add the BKL to those file systems that already
       don't use it elsewhere]

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/adfs/super.c          |  8 +++++++-
 fs/affs/super.c          |  9 ++++++++-
 fs/afs/super.c           |  5 +++++
 fs/bfs/inode.c           |  8 +++++++-
 fs/cifs/cifsfs.c         | 12 ++++++++++--
 fs/coda/inode.c          |  8 +++++++-
 fs/ecryptfs/main.c       |  4 ++++
 fs/ext2/super.c          | 10 ++++++++--
 fs/ext3/super.c          |  9 ++++++++-
 fs/ext4/super.c          |  8 ++++++--
 fs/fat/namei_msdos.c     |  7 ++++++-
 fs/fat/namei_vfat.c      |  7 ++++++-
 fs/freevxfs/vxfs_super.c |  7 ++++++-
 fs/hfs/super.c           |  8 +++++++-
 fs/hpfs/super.c          |  8 +++++++-
 fs/isofs/inode.c         |  8 +++++++-
 fs/jffs2/super.c         | 11 +++++++++--
 fs/jfs/super.c           | 12 ++++++++++--
 fs/nilfs2/super.c        |  8 +++++++-
 fs/ntfs/super.c          |  5 +++++
 fs/ocfs2/dlmfs/dlmfs.c   |  9 ++++++++-
 fs/ocfs2/super.c         |  5 +++++
 fs/qnx4/inode.c          |  8 +++++++-
 fs/smbfs/inode.c         |  5 +++++
 fs/squashfs/super.c      |  6 ++++++
 fs/udf/super.c           |  8 +++++++-
 fs/ufs/super.c           |  5 +++++
 kernel/cgroup.c          |  4 ++++
 28 files changed, 187 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..d9803f73236f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct adfs_sb_info *asb;
 	struct inode *root;
 
+	lock_kernel();
+
 	sb->s_flags |= MS_NODIRATIME;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-	if (!asb)
+	if (!asb) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = asb;
 
 	/* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto error;
 	} else
 		sb->s_root->d_op = &adfs_dentry_operations;
+	unlock_kernel();
 	return 0;
 
 error_free_bh:
@@ -481,6 +486,7 @@ error_free_bh:
 error:
 	sb->s_fs_info = NULL;
 	kfree(asb);
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..3a6d1dee4a51 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -291,6 +291,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	u8			 sig[4];
 	int			 ret = -EINVAL;
 
+	lock_kernel();
+
 	save_mount_options(sb, data);
 
 	pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
@@ -300,8 +302,10 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= MS_NODIRATIME;
 
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->s_bmlock);
 	spin_lock_init(&sbi->symlink_lock);
@@ -312,6 +316,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "AFFS: Error parsing options\n");
 		kfree(sbi->s_prefix);
 		kfree(sbi);
+		unlock_kernel();
 		return -EINVAL;
 	}
 	/* N.B. after this point s_prefix must be released */
@@ -482,6 +487,7 @@ got_root:
 	sb->s_root->d_op = &affs_dentry_operations;
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
+	unlock_kernel();
 	return 0;
 
 	/*
@@ -496,6 +502,7 @@ out_error_noinode:
 	kfree(sbi->s_prefix);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..6c2fef44d385 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -302,12 +302,15 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	struct inode *inode = NULL;
 	int ret;
 
+	lock_kernel();
+
 	_enter("");
 
 	/* allocate a superblock info record */
 	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
 	if (!as) {
 		_leave(" = -ENOMEM");
+		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -341,6 +344,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_root = root;
 
 	_leave(" = 0");
+	unlock_kernel();
 	return 0;
 
 error_inode:
@@ -354,6 +358,7 @@ error:
 	sb->s_fs_info = NULL;
 
 	_leave(" = %d", ret);
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..d2e09363dd93 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -322,9 +322,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	int ret = -EINVAL;
 	unsigned long i_sblock, i_eblock, i_eoff, s_size;
 
+	lock_kernel();
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
+	if (!info) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	mutex_init(&info->bfs_lock);
 	s->s_fs_info = info;
 
@@ -439,6 +443,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	brelse(bh);
 	brelse(sbh);
 	dump_imap("read_super", s);
+	unlock_kernel();
 	return 0;
 
 out3:
@@ -452,6 +457,7 @@ out:
 	mutex_destroy(&info->bfs_lock);
 	kfree(info);
 	s->s_fs_info = NULL;
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..070bf1aecd2d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -514,22 +514,30 @@ cifs_get_sb(struct file_system_type *fs_type,
 	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int rc;
-	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
+	struct super_block *sb;
+
+	lock_kernel();
+
+	sb = sget(fs_type, NULL, set_anon_super, NULL);
 
 	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
-	if (IS_ERR(sb))
+	if (IS_ERR(sb)) {
+		unlock_kernel();
 		return PTR_ERR(sb);
+	}
 
 	sb->s_flags = flags;
 
 	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		deactivate_locked_super(sb);
+		unlock_kernel();
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
 	simple_set_mnt(mnt, sb);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..bfe8179b1295 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -148,6 +148,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	int error;
 	int idx;
 
+	lock_kernel();
+
 	idx = get_device_index((struct coda_mount_data *) data);
 
 	/* Ignore errors in data, for backward compatibility */
@@ -159,11 +161,13 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	vc = &coda_comms[idx];
 	if (!vc->vc_inuse) {
 		printk("coda_read_super: No pseudo device\n");
+		unlock_kernel();
 		return -EINVAL;
 	}
 
         if ( vc->vc_sb ) {
 		printk("coda_read_super: Device already mounted\n");
+		unlock_kernel();
 		return -EBUSY;
 	}
 
@@ -202,7 +206,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root)
 		goto error;
-        return 0;
+	unlock_kernel();
+	return 0;
 
  error:
 	bdi_destroy(&vc->bdi);
@@ -212,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	if (vc)
 		vc->vc_sb = NULL;
 
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..c4af92fa12c3 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "ecryptfs_kernel.h"
 
 /**
@@ -550,6 +551,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 	const char *err = "Getting sb failed";
 	int rc;
 
+	lock_kernel();
 	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
 	if (!sbi) {
 		rc = -ENOMEM;
@@ -608,6 +610,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto out;
 	}
 	simple_set_mnt(mnt, s);
+	unlock_kernel();
 	return 0;
 
 out:
@@ -616,6 +619,7 @@ out:
 		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
 	}
 	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
+	unlock_kernel();
 	return rc;
 }
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..f98c390caf1d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -747,15 +747,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	__le32 features;
 	int err;
 
+	lock_kernel();
+
+	err = -ENOMEM;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto failed_unlock;
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
-		return -ENOMEM;
+		goto failed_unlock;
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
@@ -1083,6 +1086,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
 		sb->s_flags |= MS_RDONLY;
 	ext2_write_super(sb);
+	unlock_kernel();
 	return 0;
 
 cantfind_ext2:
@@ -1107,6 +1111,8 @@ failed_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
+failed_unlock:
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..41f9dcd73e97 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1611,14 +1611,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	__le32 features;
 	int err;
 
+	lock_kernel();
+
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
+		unlock_kernel();
 		return -ENOMEM;
 	}
 	sb->s_fs_info = sbi;
@@ -2026,6 +2031,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		"writeback");
 
 	lock_kernel();
+	unlock_kernel();
 	return 0;
 
 cantfind_ext3:
@@ -2056,6 +2062,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..0f0021f4990c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2568,6 +2568,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 
+	lock_kernel();
+
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		goto out_free_orig;
@@ -3166,7 +3168,6 @@ no_journal:
 	if (es->s_error_count)
 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
-	lock_kernel();
 	kfree(orig_data);
 	return 0;
 
@@ -3213,8 +3214,11 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	lock_kernel();
+	kfree(orig_data);
+	return ret;
+
 out_free_orig:
+	unlock_kernel();
 	kfree(orig_data);
 	return ret;
 }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..e2b0b978340d 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
+#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
@@ -662,12 +663,16 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
+	lock_kernel();
 	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-	if (res)
+	if (res) {
+		unlock_kernel();
 		return res;
+	}
 
 	sb->s_flags |= MS_NOATIME;
 	sb->s_root->d_op = &msdos_dentry_operations;
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..9006ad9c7b11 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "fat.h"
 
 /*
@@ -1055,15 +1056,19 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
+	lock_kernel();
 	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-	if (res)
+	if (res) {
+		unlock_kernel();
 		return res;
+	}
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
 		sb->s_root->d_op = &vfat_ci_dentry_ops;
 	else
 		sb->s_root->d_op = &vfat_dentry_ops;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..eb2b9e09c996 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -148,7 +148,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
  *   The superblock on success, else %NULL.
  *
  * Locking:
- *   We are under the bkl and @sbp->s_lock.
+ *   We are under @sbp->s_lock.
  */
 static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 {
@@ -159,11 +159,14 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 	struct inode *root;
 	int ret = -EINVAL;
 
+	lock_kernel();
+
 	sbp->s_flags |= MS_RDONLY;
 
 	infp = kzalloc(sizeof(*infp), GFP_KERNEL);
 	if (!infp) {
 		printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
+		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -236,6 +239,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 		goto out_free_ilist;
 	}
 
+	unlock_kernel();
 	return 0;
 	
 out_free_ilist:
@@ -245,6 +249,7 @@ out_free_ilist:
 out:
 	brelse(bp);
 	kfree(infp);
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..3069416fa8ef 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -382,9 +382,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root_inode;
 	int res;
 
+	lock_kernel();
+
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
@@ -435,6 +439,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root->d_op = &hfs_dentry_operations;
 
 	/* everything's okay */
+	unlock_kernel();
 	return 0;
 
 bail_iput:
@@ -443,6 +448,7 @@ bail_no_root:
 	printk(KERN_ERR "hfs: get root inode failed.\n");
 bail:
 	hfs_mdb_put(sb);
+	unlock_kernel();
 	return res;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..c969a1aa163a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -477,11 +477,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	int o;
 
+	lock_kernel();
+
 	save_mount_options(s, options);
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	s->s_fs_info = sbi;
 
 	sbi->sb_bmp_dir = NULL;
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 			root->i_blocks = 5;
 		hpfs_brelse4(&qbh);
 	}
+	unlock_kernel();
 	return 0;
 
 bail4:	brelse(bh2);
@@ -677,6 +682,7 @@ bail0:
 	kfree(sbi->sb_cp_table);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..05baf7721e8c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -571,11 +571,15 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 	int table, error = -EINVAL;
 	unsigned int vol_desc_start;
 
+	lock_kernel();
+
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	s->s_fs_info = sbi;
 
 	if (!parse_options((char *)data, &opt))
@@ -900,6 +904,7 @@ root_found:
 
 	kfree(opt.iocharset);
 
+	unlock_kernel();
 	return 0;
 
 	/*
@@ -939,6 +944,7 @@ out_freesbi:
 	kfree(opt.iocharset);
 	kfree(sbi);
 	s->s_fs_info = NULL;
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..58dd9cf06206 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -146,14 +146,19 @@ static const struct super_operations jffs2_super_operations =
 static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct jffs2_sb_info *c;
+	int ret;
+
+	lock_kernel();
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
 		  " New superblock for device %d (\"%s\")\n",
 		  sb->s_mtd->index, sb->s_mtd->name));
 
 	c = kzalloc(sizeof(*c), GFP_KERNEL);
-	if (!c)
+	if (!c) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 
 	c->mtd = sb->s_mtd;
 	c->os_priv = sb;
@@ -175,7 +180,9 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
 #endif
-	return jffs2_do_fill_super(sb, data, silent);
+	ret = jffs2_do_fill_super(sb, data, silent);
+	unlock_kernel();
+	return ret;
 }
 
 static int jffs2_get_sb(struct file_system_type *fs_type,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..eb31f677347d 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -438,14 +438,20 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	s64 newLVSize = 0;
 	int flag, ret = -EINVAL;
 
+	lock_kernel();
+
 	jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
 
-	if (!new_valid_dev(sb->s_bdev->bd_dev))
+	if (!new_valid_dev(sb->s_bdev->bd_dev)) {
+		unlock_kernel();
 		return -EOVERFLOW;
+	}
 
 	sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->sb = sb;
 	sbi->uid = sbi->gid = sbi->umask = -1;
@@ -542,6 +548,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
 #endif
 	sb->s_time_gran = 1;
+	unlock_kernel();
 	return 0;
 
 out_no_root:
@@ -564,6 +571,7 @@ out_unload:
 		unload_nls(sbi->nls_tab);
 out_kfree:
 	kfree(sbi);
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..0d573c2a6860 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1113,9 +1113,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
+	lock_kernel();
 	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
-	if (IS_ERR(sd.bdev))
+	if (IS_ERR(sd.bdev)) {
+		unlock_kernel();
 		return PTR_ERR(sd.bdev);
+	}
 
 	/*
 	 * To get mount instance using sget() vfs-routine, NILFS needs
@@ -1198,6 +1201,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (need_to_close)
 		close_bdev_exclusive(sd.bdev, mode);
 	simple_set_mnt(mnt, s);
+	unlock_kernel();
 	return 0;
 
  failed_unlock:
@@ -1206,6 +1210,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
  failed:
 	close_bdev_exclusive(sd.bdev, mode);
 
+	unlock_kernel();
 	return err;
 
  cancel_new:
@@ -1218,6 +1223,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..1f31e77fc41f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2732,6 +2732,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	struct inode *tmp_ino;
 	int blocksize, result;
 
+	lock_kernel();
+
 	/*
 	 * We do a pretty difficult piece of bootstrap by reading the
 	 * MFT (and other metadata) from disk into memory. We'll only
@@ -2755,6 +2757,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 			ntfs_error(sb, "Allocation of NTFS volume structure "
 					"failed. Aborting mount...");
 		lockdep_on();
+		unlock_kernel();
 		return -ENOMEM;
 	}
 	/* Initialize ntfs_volume structure. */
@@ -2942,6 +2945,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		sb->s_export_op = &ntfs_export_ops;
 		lock_kernel();
 		lockdep_on();
+		unlock_kernel();
 		return 0;
 	}
 	ntfs_error(sb, "Failed to allocate root directory.");
@@ -3062,6 +3066,7 @@ err_out_now:
 	kfree(vol);
 	ntfs_debug("Failed, returning -EINVAL.");
 	lockdep_on();
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..667d7ceba8c9 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -44,6 +44,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/poll.h>
+#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 
@@ -588,21 +589,27 @@ static int dlmfs_fill_super(struct super_block * sb,
 	struct inode * inode;
 	struct dentry * root;
 
+	lock_kernel();
+
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = DLMFS_MAGIC;
 	sb->s_op = &dlmfs_ops;
 	inode = dlmfs_get_root_inode(sb);
-	if (!inode)
+	if (!inode) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
+		unlock_kernel();
 		return -ENOMEM;
 	}
 	sb->s_root = root;
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..b7e4f2d19d40 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1002,6 +1002,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	char nodestr[8];
 	struct ocfs2_blockcheck_stats stats;
 
+	lock_kernel();
+
 	mlog_entry("%p, %p, %i", sb, data, silent);
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
@@ -1179,6 +1181,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 			atomic_set(&osb->vol_state, VOLUME_DISABLED);
 			wake_up(&osb->osb_mount_event);
 			mlog_exit(status);
+			unlock_kernel();
 			return status;
 		}
 	}
@@ -1193,6 +1196,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	ocfs2_orphan_scan_start(osb);
 
 	mlog_exit(status);
+	unlock_kernel();
 	return status;
 
 read_super_error:
@@ -1208,6 +1212,7 @@ read_super_error:
 	}
 
 	mlog_exit(status);
+	unlock_kernel();
 	return status;
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..86a7be1399a8 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -234,9 +234,13 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	struct qnx4_sb_info *qs;
 	int ret = -EINVAL;
 
+	lock_kernel();
+
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
-	if (!qs)
+	if (!qs) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	s->s_fs_info = qs;
 
 	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
@@ -284,6 +288,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 
 	brelse(bh);
 
+	unlock_kernel();
 	return 0;
 
       outi:
@@ -293,6 +298,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
       outnobh:
 	kfree(qs);
 	s->s_fs_info = NULL;
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 450c91941988..8fc5e50e142f 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -501,6 +501,8 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	void *mem;
 	static int warn_count;
 
+	lock_kernel();
+
 	if (warn_count < 5) {
 		warn_count++;
 		printk(KERN_EMERG "smbfs is deprecated and will be removed"
@@ -621,6 +623,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 
 	smb_new_dentry(sb->s_root);
 
+	unlock_kernel();
 	return 0;
 
 out_no_root:
@@ -643,9 +646,11 @@ out_wrong_data:
 out_no_data:
 	printk(KERN_ERR "smb_fill_super: missing data argument\n");
 out_fail:
+	unlock_kernel();
 	return -EINVAL;
 out_no_server:
 	printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
+	unlock_kernel();
 	return -ENOMEM;
 }
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..ab1a401a0e19 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -87,11 +87,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	u64 lookup_table_start, xattr_id_table_start;
 	int err;
 
+	lock_kernel();
+
 	TRACE("Entered squashfs_fill_superblock\n");
 
 	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
 	if (sb->s_fs_info == NULL) {
 		ERROR("Failed to allocate squashfs_sb_info\n");
+		unlock_kernel();
 		return -ENOMEM;
 	}
 	msblk = sb->s_fs_info;
@@ -301,6 +304,7 @@ allocate_root:
 
 	TRACE("Leaving squashfs_fill_super\n");
 	kfree(sblk);
+	unlock_kernel();
 	return 0;
 
 failed_mount:
@@ -315,11 +319,13 @@ failed_mount:
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
+	unlock_kernel();
 	return err;
 
 failure:
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+	unlock_kernel();
 	return -ENOMEM;
 }
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..76f3d6d97b40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1880,6 +1880,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	struct kernel_lb_addr rootdir, fileset;
 	struct udf_sb_info *sbi;
 
+	lock_kernel();
+
 	uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
 	uopt.uid = -1;
 	uopt.gid = -1;
@@ -1888,8 +1890,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	uopt.dmode = UDF_INVALID_MODE;
 
 	sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 
 	sb->s_fs_info = sbi;
 
@@ -2035,6 +2039,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		goto error_out;
 	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	unlock_kernel();
 	return 0;
 
 error_out:
@@ -2055,6 +2060,7 @@ error_out:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..6b9be90dae7d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned maxsymlen;
 	int ret = -EINVAL;
 
+	lock_kernel();
+
 	uspi = NULL;
 	ubh = NULL;
 	flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
 			goto failed;
 
 	UFSD("EXIT\n");
+	unlock_kernel();
 	return 0;
 
 dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 	UFSD("EXIT (FAILED)\n");
+	unlock_kernel();
 	return ret;
 
 failed_nomem:
 	UFSD("EXIT (NOMEM)\n");
+	unlock_kernel();
 	return -ENOMEM;
 }
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..a7ba3bccadc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1430,6 +1430,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
 
+	lock_kernel();
+
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
@@ -1559,6 +1561,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	simple_set_mnt(mnt, sb);
 	kfree(opts.release_agent);
 	kfree(opts.name);
+	unlock_kernel();
 	return 0;
 
  drop_new_super:
@@ -1568,6 +1571,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
+	unlock_kernel();
 
 	return ret;
 }
-- 
cgit v1.2.3


From 74c41429ae5889933a2b9b9d0d2f687baa410766 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Sun, 15 Aug 2010 22:52:36 +0200
Subject: BKL: Remove BKL from Amiga FFS

The BKL is only used in put_super, fill_super and remount_fs that are all
three protected by the superblocks s_umount rw_semaphore. Therefore it is
safe to remove the BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/affs/super.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3a6d1dee4a51..a167f96d79f7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "affs.h"
 
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	pr_debug("AFFS: put_super()\n");
 
-	lock_kernel();
-
 	if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
 		affs_commit_super(sb, 1, 1);
 
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
 	affs_brelse(sbi->s_root_bh);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static void
@@ -291,8 +286,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	u8			 sig[4];
 	int			 ret = -EINVAL;
 
-	lock_kernel();
-
 	save_mount_options(sb, data);
 
 	pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
@@ -302,10 +295,9 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= MS_NODIRATIME;
 
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		unlock_kernel();
+	if (!sbi)
 		return -ENOMEM;
-	}
+
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->s_bmlock);
 	spin_lock_init(&sbi->symlink_lock);
@@ -316,7 +308,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "AFFS: Error parsing options\n");
 		kfree(sbi->s_prefix);
 		kfree(sbi);
-		unlock_kernel();
 		return -EINVAL;
 	}
 	/* N.B. after this point s_prefix must be released */
@@ -487,7 +478,6 @@ got_root:
 	sb->s_root->d_op = &affs_dentry_operations;
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
-	unlock_kernel();
 	return 0;
 
 	/*
@@ -502,7 +492,6 @@ out_error_noinode:
 	kfree(sbi->s_prefix);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	unlock_kernel();
 	return ret;
 }
 
@@ -534,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	lock_kernel();
+
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -550,17 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	memcpy(sbi->s_volume, volume, 32);
 	spin_unlock(&sbi->symlink_lock);
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
-	}
+
 	if (*flags & MS_RDONLY) {
 		affs_write_super(sb);
 		affs_free_bitmap(sb);
 	} else
 		res = affs_init_bitmap(sb, flags);
 
-	unlock_kernel();
 	return res;
 }
 
-- 
cgit v1.2.3


From ba13d597a60a1a26614f18b76c1a2cad1a548e46 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:29 +0100
Subject: BKL: Remove BKL from BFS

The BKL is only used in put_super and fill_super that are both protected by
the superblocks s_umount rw_semaphore. Therefore it is safe to remove the BKL
entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/bfs/inode.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d2e09363dd93..883e77acd5a8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
-	lock_kernel();
-
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -322,13 +317,9 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	int ret = -EINVAL;
 	unsigned long i_sblock, i_eblock, i_eoff, s_size;
 
-	lock_kernel();
-
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info) {
-		unlock_kernel();
+	if (!info)
 		return -ENOMEM;
-	}
 	mutex_init(&info->bfs_lock);
 	s->s_fs_info = info;
 
@@ -443,7 +434,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	brelse(bh);
 	brelse(sbh);
 	dump_imap("read_super", s);
-	unlock_kernel();
 	return 0;
 
 out3:
@@ -457,7 +447,6 @@ out:
 	mutex_destroy(&info->bfs_lock);
 	kfree(info);
 	s->s_fs_info = NULL;
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From b0991aa324b57dca8feef75ed75b24080ee4a9fc Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:29 +0100
Subject: BKL: Remove BKL from CifsFS

The BKL is only used in put_super and fill_super that are both protected by
the superblocks s_umount rw_semaphore. Therefore it is safe to remove the
BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Cc: Steve French <smfrench@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/cifs/cifsfs.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 070bf1aecd2d..4e273f7793f6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,6 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-#include <linux/smp_lock.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -200,8 +199,6 @@ cifs_put_super(struct super_block *sb)
 		return;
 	}
 
-	lock_kernel();
-
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +212,6 @@ cifs_put_super(struct super_block *sb)
 	unload_nls(cifs_sb->local_nls);
 	bdi_destroy(&cifs_sb->bdi);
 	kfree(cifs_sb);
-
-	unlock_kernel();
 }
 
 static int
@@ -516,28 +511,22 @@ cifs_get_sb(struct file_system_type *fs_type,
 	int rc;
 	struct super_block *sb;
 
-	lock_kernel();
-
 	sb = sget(fs_type, NULL, set_anon_super, NULL);
 
 	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
-	if (IS_ERR(sb)) {
-		unlock_kernel();
+	if (IS_ERR(sb))
 		return PTR_ERR(sb);
-	}
 
 	sb->s_flags = flags;
 
 	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		deactivate_locked_super(sb);
-		unlock_kernel();
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
 	simple_set_mnt(mnt, sb);
-	unlock_kernel();
 	return 0;
 }
 
-- 
cgit v1.2.3


From d646cf82e9b6a58ba2d748e66e5fc7223830c68c Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:29 +0100
Subject: BKL: Remove BKL from ext3 fill_super()

The BKL is protecting nothing than two memory allocations here.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext3/super.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 41f9dcd73e97..edde63fdb9a1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1611,19 +1611,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	__le32 features;
 	int err;
 
-	lock_kernel();
-
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi) {
-		unlock_kernel();
+	if (!sbi)
 		return -ENOMEM;
-	}
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
-		unlock_kernel();
 		return -ENOMEM;
 	}
 	sb->s_fs_info = sbi;
@@ -1632,8 +1627,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_resgid = EXT3_DEF_RESGID;
 	sbi->s_sb_block = sb_block;
 
-	unlock_kernel();
-
 	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -2030,8 +2023,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	lock_kernel();
-	unlock_kernel();
 	return 0;
 
 cantfind_ext3:
@@ -2061,8 +2052,6 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	lock_kernel();
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 77b54a46a83232d172d55013c3af838190615260 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:30 +0100
Subject: BKL: Remove BKL from ext3_put_super() and ext3_remount()

The BKL lock is protecting the remounting against a potential call to
ext3_put_super(). This could not happen, since this is protected by the
s_umount rw semaphore of struct super_block.

Therefore I think the BKL is protecting nothing here.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext3/super.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index edde63fdb9a1..2e20bd771337 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
 	int i, err;
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-
-	lock_kernel();
-
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *ext3_inode_cachep;
@@ -2534,8 +2529,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
-	lock_kernel();
-
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -2644,7 +2637,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 
 	if (enable_quota)
 		dquot_resume(sb, -1);
@@ -2665,7 +2657,6 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	return err;
 }
 
-- 
cgit v1.2.3


From f2143c4e2ebc6be3f07b7c7527dae7313fde23e1 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:30 +0100
Subject: BKL: Remove BKL from ext4 filesystem

The BKL is still used in ext4_put_super(), ext4_fill_super() and
ext4_remount(). All three calles are protected against concurrent calls by
the s_umount rw semaphore of struct super_block.

Therefore the BKL is protecting nothing in this case.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext4/super.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0f0021f4990c..24e7699f915d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/vfs.h>
@@ -708,7 +707,6 @@ static void ext4_put_super(struct super_block *sb)
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
-	lock_kernel();
 	if (sb->s_dirt)
 		ext4_commit_super(sb, 1);
 
@@ -775,7 +773,6 @@ static void ext4_put_super(struct super_block *sb)
 	 * Now that we are completely done shutting down the
 	 * superblock, we need to actually destroy the kobject.
 	 */
-	unlock_kernel();
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
@@ -2568,8 +2565,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 
-	lock_kernel();
-
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		goto out_free_orig;
@@ -2590,8 +2585,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sbi->s_sectors_written_start =
 			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 
-	unlock_kernel();
-
 	/* Cleanup superblock name */
 	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
 		*cp = '!';
@@ -3214,11 +3207,7 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	kfree(orig_data);
-	return ret;
-
 out_free_orig:
-	unlock_kernel();
 	kfree(orig_data);
 	return ret;
 }
@@ -3726,8 +3715,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 
-	lock_kernel();
-
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3862,7 +3849,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	if (enable_quota)
 		dquot_resume(sb, -1);
 
@@ -3888,7 +3874,6 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	kfree(orig_data);
 	return err;
 }
-- 
cgit v1.2.3


From 8526fb37c907f8f4eb9965367af6aeec76aedbce Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:30 +0100
Subject: BKL: Remove BKL from HFS

The BKL is only used in put_super and fill_super that are both protected by
the superblocks s_umount rw_semaphore. Therefore it is safe to remove the
BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/hfs/super.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3069416fa8ef..33254160f650 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -382,13 +377,10 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root_inode;
 	int res;
 
-	lock_kernel();
-
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		unlock_kernel();
+	if (!sbi)
 		return -ENOMEM;
-	}
+
 	sb->s_fs_info = sbi;
 	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
@@ -439,7 +431,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root->d_op = &hfs_dentry_operations;
 
 	/* everything's okay */
-	unlock_kernel();
 	return 0;
 
 bail_iput:
@@ -448,7 +439,6 @@ bail_no_root:
 	printk(KERN_ERR "hfs: get root inode failed.\n");
 bail:
 	hfs_mdb_put(sb);
-	unlock_kernel();
 	return res;
 }
 
-- 
cgit v1.2.3


From 22b26db6f82bfa9a7f2b44443af3b5541927a130 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:31 +0100
Subject: BKL: Remove BKL from JFS

The BKL is only used in put_super, fill_super and remount_fs that are all
three protected by the superblocks s_umount rw_semaphore. Therefore it is
safe to remove the BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/jfs/super.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index eb31f677347d..68eee2bf629e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -176,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	lock_kernel();
-
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
 	iput(sbi->direct_inode);
 
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 enum {
@@ -369,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
-	lock_kernel();
+
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
-			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc) {
-			unlock_kernel();
+		if (rc)
 			return rc;
-		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		/* mark the fs r/w for quota activity */
 		sb->s_flags &= ~MS_RDONLY;
 
-		unlock_kernel();
 		dquot_resume(sb, -1);
 		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = dquot_suspend(sb, -1);
 		if (rc < 0) {
-			unlock_kernel();
 			return rc;
 		}
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
-		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc) {
-				unlock_kernel();
+			if (rc)
 				return rc;
-			}
+
 			JFS_SBI(sb)->flag = flag;
 			ret = jfs_mount_rw(sb, 1);
-			unlock_kernel();
 			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
-	unlock_kernel();
 	return 0;
 }
 
@@ -438,20 +424,15 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	s64 newLVSize = 0;
 	int flag, ret = -EINVAL;
 
-	lock_kernel();
-
 	jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
 
-	if (!new_valid_dev(sb->s_bdev->bd_dev)) {
-		unlock_kernel();
+	if (!new_valid_dev(sb->s_bdev->bd_dev))
 		return -EOVERFLOW;
-	}
 
 	sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		unlock_kernel();
+	if (!sbi)
 		return -ENOMEM;
-	}
+
 	sb->s_fs_info = sbi;
 	sbi->sb = sb;
 	sbi->uid = sbi->gid = sbi->umask = -1;
@@ -548,7 +529,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
 #endif
 	sb->s_time_gran = 1;
-	unlock_kernel();
 	return 0;
 
 out_no_root:
@@ -571,7 +551,6 @@ out_unload:
 		unload_nls(sbi->nls_tab);
 out_kfree:
 	kfree(sbi);
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From d6d4c19c5f9ac5972e30e89b3c81ad1fd6e11fee Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:32 +0100
Subject: BKL: Remove BKL from NILFS2

The BKL is only used in put_super, fill_super and remount_fs that are all
three protected by the superblocks s_umount rw_semaphore. Therefore it is
safe to remove the BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/nilfs2/ioctl.c |  1 -
 fs/nilfs2/super.c | 18 +-----------------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..0442ee3b394f 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
 
 #include <linux/fs.h>
 #include <linux/wait.h>
-#include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
 #include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0d573c2a6860..9f4913f78408 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,7 +45,6 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/kobject.h>
@@ -342,8 +341,6 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	lock_kernel();
-
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -361,8 +358,6 @@ static void nilfs_put_super(struct super_block *sb)
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
 	nilfs_put_sbinfo(sbi);
-
-	unlock_kernel();
 }
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -949,8 +944,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct nilfs_mount_options old_opts;
 	int was_snapshot, err;
 
-	lock_kernel();
-
 	down_write(&nilfs->ns_super_sem);
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
@@ -1024,7 +1017,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	}
  out:
 	up_write(&nilfs->ns_super_sem);
-	unlock_kernel();
 	return 0;
 
  restore_opts:
@@ -1032,7 +1024,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
 	up_write(&nilfs->ns_super_sem);
-	unlock_kernel();
 	return err;
 }
 
@@ -1113,12 +1104,9 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	lock_kernel();
 	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
-	if (IS_ERR(sd.bdev)) {
-		unlock_kernel();
+	if (IS_ERR(sd.bdev))
 		return PTR_ERR(sd.bdev);
-	}
 
 	/*
 	 * To get mount instance using sget() vfs-routine, NILFS needs
@@ -1201,7 +1189,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (need_to_close)
 		close_bdev_exclusive(sd.bdev, mode);
 	simple_set_mnt(mnt, s);
-	unlock_kernel();
 	return 0;
 
  failed_unlock:
@@ -1209,8 +1196,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	put_nilfs(nilfs);
  failed:
 	close_bdev_exclusive(sd.bdev, mode);
-
-	unlock_kernel();
 	return err;
 
  cancel_new:
@@ -1223,7 +1208,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
-	unlock_kernel();
 	return err;
 }
 
-- 
cgit v1.2.3


From efdffb54034a6fc96bc7fafa33ca2a5503113eee Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:33 +0100
Subject: BKL: Remove BKL from NTFS

The BKL is only used in put_super, fill_super and remount_fs that are all
three protected by the superblocks s_umount rw_semaphore. Therefore it is
safe to remove the BKL entirely.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ntfs/super.c | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1f31e77fc41f..19c5180f8a28 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
-#include <linux/smp_lock.h>
 #include <linux/bitmap.h>
 
 #include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	ntfs_debug("Entering with remount options string: %s", opt);
 
-	lock_kernel();
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
 	*flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt)) {
-		unlock_kernel();
+	if (!parse_options(vol, opt))
 		return -EINVAL;
-	}
-	unlock_kernel();
+
 	ntfs_debug("Done.");
 	return 0;
 }
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
 
 	ntfs_debug("Entering.");
 
-	lock_kernel();
-
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(vol);
-
-	unlock_kernel();
 }
 
 /**
@@ -2732,8 +2716,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	struct inode *tmp_ino;
 	int blocksize, result;
 
-	lock_kernel();
-
 	/*
 	 * We do a pretty difficult piece of bootstrap by reading the
 	 * MFT (and other metadata) from disk into memory. We'll only
@@ -2757,7 +2739,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 			ntfs_error(sb, "Allocation of NTFS volume structure "
 					"failed. Aborting mount...");
 		lockdep_on();
-		unlock_kernel();
 		return -ENOMEM;
 	}
 	/* Initialize ntfs_volume structure. */
@@ -2775,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	init_rwsem(&vol->mftbmp_lock);
 	init_rwsem(&vol->lcnbmp_lock);
 
-	unlock_kernel();
-
 	/* By default, enable sparse support. */
 	NVolSetSparseEnabled(vol);
 
@@ -2943,9 +2922,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		}
 		mutex_unlock(&ntfs_lock);
 		sb->s_export_op = &ntfs_export_ops;
-		lock_kernel();
 		lockdep_on();
-		unlock_kernel();
 		return 0;
 	}
 	ntfs_error(sb, "Failed to allocate root directory.");
@@ -3061,12 +3038,10 @@ iput_tmp_ino_err_out_now:
 	}
 	/* Errors at this stage are irrelevant. */
 err_out_now:
-	lock_kernel();
 	sb->s_fs_info = NULL;
 	kfree(vol);
 	ntfs_debug("Failed, returning -EINVAL.");
 	lockdep_on();
-	unlock_kernel();
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 6841c05021236b8d394cc6c41aa6ae17623aef13 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:35 +0100
Subject: BKL: Remove BKL from do_new_mount()

After pushing down the BKL to the get_sb/fill_super operations of the
filesystems that still make usage of the BKL it is safe to remove it from
do_new_mount().

I've read through all the code formerly covered by the BKL inside
do_kern_mount() and have satisfied myself that it doesn't need the BKL
any more.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..7ca5182c0bed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1744,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-- 
cgit v1.2.3


From 3e44f9f1dc19e2b8d4f7ca3b3c63e976b22ad372 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@infradead.org>
Date: Wed, 24 Feb 2010 13:25:35 +0100
Subject: BKL: Remove BKL from ext2 filesystem

The BKL is still used in ext2_put_super(), ext2_fill_super(), ext2_sync_fs()
ext2_remount() and ext2_write_inode(). From these calls ext2_put_super(),
ext2_fill_super() and ext2_remount() are protected against each other by
the struct super_block s_umount rw semaphore. The call in ext2_write_inode()
could only protect the modification of the ext2_sb_info through
ext2_update_dynamic_rev() against concurrent ext2_sync_fs() or ext2_remount().
ext2_fill_super() and ext2_put_super() can be left out because you need a
valid filesystem reference in all three cases, which you do not have when
you are one of these functions.

If the BKL is only protecting the modification of the ext2_sb_info it can
safely be removed since this is protected by the struct ext2_sb_info s_lock.

Signed-off-by: Jan Blunck <jblunck@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext2/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f98c390caf1d..85df87d0f7b7 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -747,8 +747,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	__le32 features;
 	int err;
 
-	lock_kernel();
-
 	err = -ENOMEM;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1086,7 +1084,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
 		sb->s_flags |= MS_RDONLY;
 	ext2_write_super(sb);
-	unlock_kernel();
 	return 0;
 
 cantfind_ext2:
@@ -1112,7 +1109,6 @@ failed_sbi:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 failed_unlock:
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 3768744cfea7b995dce27f02341161fbfdfee80c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 Sep 2010 23:07:27 +0200
Subject: BKL: Remove BKL from fat

The lock_kernel in fat_put_super is not needed because
it only protects the super block itself and we know that
no other thread can reach it because we are about to
kfree the object.

In the two fill_super functions, this converts the locking
to use lock_super like elsewhere in the fat code. This
is probably not needed either, but is consistent and puts
us on the safe side.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Jan Blunck <jblunck@infradead.org>
---
 fs/fat/inode.c       | 5 -----
 fs/fat/namei_msdos.c | 7 +++----
 fs/fat/namei_vfat.c  | 7 +++----
 3 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..ad6998a92c30 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e2b0b978340d..bbca5c186ae7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
@@ -663,16 +662,16 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
-	lock_kernel();
+	lock_super(sb);
 	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
 	if (res) {
-		unlock_kernel();
+		unlock_super(sb);
 		return res;
 	}
 
 	sb->s_flags |= MS_NOATIME;
 	sb->s_root->d_op = &msdos_dentry_operations;
-	unlock_kernel();
+	unlock_super(sb);
 	return 0;
 }
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 9006ad9c7b11..6f0f6c9a0152 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -21,7 +21,6 @@
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
-#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "fat.h"
 
 /*
@@ -1056,10 +1055,10 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
-	lock_kernel();
+	lock_super(sb);
 	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
 	if (res) {
-		unlock_kernel();
+		unlock_super(sb);
 		return res;
 	}
 
@@ -1068,7 +1067,7 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		sb->s_root->d_op = &vfat_dentry_ops;
 
-	unlock_kernel();
+	unlock_super(sb);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4f819a7899b06afcd7623ab9d00fd81503ad3e24 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 12 Sep 2010 19:05:56 +0200
Subject: BKL: Remove BKL from isofs

As in other file systems, we can replace the big kernel lock
with a private mutex in isofs. This means we can now access
multiple file systems concurrently, but it also means that
we serialize readdir and lookup across sleeping operations
which previously released the big kernel lock. This should
not matter though, as these operations are in practice
serialized through the hardware access.

The isofs_get_blocks functions now does not take any lock
any more, it used to recursively get the BKL. After looking
at the code for hours, I convinced myself that it was never
needed here anyway, because it only reads constant fields
of the inode and writes to a buffer head array that is
at this time only visible to the caller.

The get_sb and fill_super operations do not need the locking
at all because they operate on a file system that is either
about to be created or to be destroyed but in either case
is not visible to other threads.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/isofs/dir.c   |  6 +++---
 fs/isofs/inode.c | 17 ++---------------
 fs/isofs/isofs.h |  1 +
 fs/isofs/namei.c |  8 ++++----
 fs/isofs/rock.c  | 10 +++++-----
 5 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
  *
  *  isofs directory handling functions
  */
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
 
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
 	char *tmpname;
 	struct iso_directory_record *tmpde;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 
 	tmpname = (char *)__get_free_page(GFP_KERNEL);
 	if (tmpname == NULL)
 		return -ENOMEM;
 
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	tmpde = (struct iso_directory_record *) (tmpname+1024);
 
 	result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
 
 	free_page((unsigned long) tmpname);
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	return result;
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 05baf7721e8c..09ff41a752a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/nls.h>
 #include <linux/ctype.h>
-#include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include <linux/cdrom.h>
 #include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
 
 #ifdef CONFIG_JOLIET
-	lock_kernel();
-
 	unload_nls(sbi->s_nls_iocharset);
-
-	unlock_kernel();
 #endif
 
 	kfree(sbi);
@@ -571,15 +566,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 	int table, error = -EINVAL;
 	unsigned int vol_desc_start;
 
-	lock_kernel();
-
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi) {
-		unlock_kernel();
+	if (!sbi)
 		return -ENOMEM;
-	}
 	s->s_fs_info = sbi;
 
 	if (!parse_options((char *)data, &opt))
@@ -827,6 +818,7 @@ root_found:
 	sbi->s_utf8 = opt.utf8;
 	sbi->s_nocompress = opt.nocompress;
 	sbi->s_overriderockperm = opt.overriderockperm;
+	mutex_init(&sbi->s_mutex);
 	/*
 	 * It would be incredibly stupid to allow people to mark every file
 	 * on the disk as suid, so we merely allow them to set the default
@@ -904,7 +896,6 @@ root_found:
 
 	kfree(opt.iocharset);
 
-	unlock_kernel();
 	return 0;
 
 	/*
@@ -944,7 +935,6 @@ out_freesbi:
 	kfree(opt.iocharset);
 	kfree(sbi);
 	s->s_fs_info = NULL;
-	unlock_kernel();
 	return error;
 }
 
@@ -983,8 +973,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 	int section, rv, error;
 	struct iso_inode_info *ei = ISOFS_I(inode);
 
-	lock_kernel();
-
 	error = -EIO;
 	rv = 0;
 	if (iblock < 0 || iblock != iblock_s) {
@@ -1060,7 +1048,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 
 	error = 0;
 abort:
-	unlock_kernel();
 	return rv != 0 ? rv : error;
 }
 
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
 	gid_t s_gid;
 	uid_t s_uid;
 	struct nls_table *s_nls_iocharset; /* Native language support table */
+	struct mutex s_mutex; /* replaces BKL, please remove if possible */
 };
 
 #define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..0d23abfd4280 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
 
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	int found;
 	unsigned long uninitialized_var(block);
 	unsigned long uninitialized_var(offset);
+	struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
 	struct inode *inode;
 	struct page *page;
 
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	found = isofs_find_entry(dir, dentry,
 				&block, &offset,
 				page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	if (found) {
 		inode = isofs_iget(dir->i_sb, block, offset);
 		if (IS_ERR(inode)) {
-			unlock_kernel();
+			mutex_unlock(&sbi->s_mutex);
 			return ERR_CAST(inode);
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	return d_splice_alias(inode, dentry);
 }
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
 
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include "isofs.h"
 #include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct iso_inode_info *ei = ISOFS_I(inode);
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 	char *link = kmap(page);
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
 	struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 	struct rock_state rs;
 	int ret;
 
-	if (!ISOFS_SB(inode->i_sb)->s_rock)
+	if (!sbi->s_rock)
 		goto error;
 
 	init_rock_state(&rs, inode);
 	block = ei->i_iget5_block;
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh)
 		goto out_noread;
@@ -748,7 +748,7 @@ repeat:
 		goto fail;
 	brelse(bh);
 	*rpnt = '\0';
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
 	printk("symlink spans iso9660 blocks\n");
 fail:
 	brelse(bh);
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 error:
 	SetPageError(page);
 	kunmap(page);
-- 
cgit v1.2.3


From 00e300e1b682ea86fcb5216250722afd42686ab3 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 Sep 2010 23:00:34 +0200
Subject: BKL: Remove BKL from autofs4

autofs4 uses the BKL only to guard its ioctl operations.
This can be trivially converted to use a mutex, as we have
done with most device drivers before.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ian Kent <raven@themaw.net>
---
 fs/autofs4/root.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..9dd29c29fd12 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,7 +19,7 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 
 #include "autofs_i.h"
 
@@ -978,15 +978,17 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	}
 }
 
+static DEFINE_MUTEX(autofs4_ioctl_mutex);
+
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
 	long ret;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
@@ -998,13 +1000,13 @@ static long autofs4_root_compat_ioctl(struct file *filp,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 	else
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
 			(unsigned long)compat_ptr(arg));
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 77f2fe036cd51300c80d1aca76bcf23a09977f13 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 16 Sep 2010 15:35:54 +0200
Subject: BKL: Remove BKL from afs

The BKL is only used in put_super and fill_super, which are both protected
by the superblocks s_umount rw_semaphore. Therefore it is safe to remove
the BKL entirely.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: linux-afs@lists.infradead.org
Cc: David Howells <dhowells@redhat.com>
---
 fs/afs/super.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/super.c b/fs/afs/super.c
index 6c2fef44d385..eacf76d98ae0 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -302,15 +301,12 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	struct inode *inode = NULL;
 	int ret;
 
-	lock_kernel();
-
 	_enter("");
 
 	/* allocate a superblock info record */
 	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
 	if (!as) {
 		_leave(" = -ENOMEM");
-		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -344,7 +340,6 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_root = root;
 
 	_leave(" = 0");
-	unlock_kernel();
 	return 0;
 
 error_inode:
@@ -358,7 +353,6 @@ error:
 	sb->s_fs_info = NULL;
 
 	_leave(" = %d", ret);
-	unlock_kernel();
 	return ret;
 }
 
@@ -458,12 +452,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
-- 
cgit v1.2.3


From 18dfe89d7c394b56e20e06dde8b7a80ff96cf02c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 16 Sep 2010 15:35:54 +0200
Subject: BKL: Remove BKL from ecryptfs

The BKL is only used in fill_super, which is protected by the superblocks
s_umount rw_semaphorei, and in fasync, which does not do anything that
could require the BKL. Therefore it is safe to remove the BKL entirely.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: ecryptfs-devel@lists.launchpad.net
---
 fs/ecryptfs/file.c | 3 ---
 fs/ecryptfs/main.c | 4 ----
 2 files changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..2b9a644b7583 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
-#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -284,11 +283,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 	int rc = 0;
 	struct file *lower_file = NULL;
 
-	lock_kernel();
 	lower_file = ecryptfs_file_to_lower(file);
 	if (lower_file->f_op && lower_file->f_op->fasync)
 		rc = lower_file->f_op->fasync(fd, lower_file, flag);
-	unlock_kernel();
 	return rc;
 }
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index c4af92fa12c3..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,7 +36,6 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>     /* For lock_kernel() */
 #include "ecryptfs_kernel.h"
 
 /**
@@ -551,7 +550,6 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 	const char *err = "Getting sb failed";
 	int rc;
 
-	lock_kernel();
 	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
 	if (!sbi) {
 		rc = -ENOMEM;
@@ -610,7 +608,6 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto out;
 	}
 	simple_set_mnt(mnt, s);
-	unlock_kernel();
 	return 0;
 
 out:
@@ -619,7 +616,6 @@ out:
 		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
 	}
 	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
-	unlock_kernel();
 	return rc;
 }
 
-- 
cgit v1.2.3


From 1a028dd2dd589c3924d9711a1b073a13c820b6b5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 16 Sep 2010 16:11:09 +0200
Subject: BKL: Remove BKL from jffs2

The BKL is only used in put_super, fill_super and remount_fs that are all
three protected by the superblocks s_umount rw_semaphore. Therefore it is
safe to remove the BKL entirely.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/fs.c    |  4 ----
 fs/jffs2/super.c | 12 +-----------
 2 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..d9beb06e6fca 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
-	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 		jffs2_start_garbage_collect_thread(c);
 
 	*flags |= MS_NOATIME;
-
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 58dd9cf06206..d1ae5dfc22b9 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -148,17 +147,13 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 	struct jffs2_sb_info *c;
 	int ret;
 
-	lock_kernel();
-
 	D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
 		  " New superblock for device %d (\"%s\")\n",
 		  sb->s_mtd->index, sb->s_mtd->name));
 
 	c = kzalloc(sizeof(*c), GFP_KERNEL);
-	if (!c) {
-		unlock_kernel();
+	if (!c)
 		return -ENOMEM;
-	}
 
 	c->mtd = sb->s_mtd;
 	c->os_priv = sb;
@@ -181,7 +176,6 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= MS_POSIXACL;
 #endif
 	ret = jffs2_do_fill_super(sb, data, silent);
-	unlock_kernel();
 	return ret;
 }
 
@@ -199,8 +193,6 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		jffs2_write_super(sb);
 
@@ -222,8 +214,6 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
-	unlock_kernel();
-
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
-- 
cgit v1.2.3


From 3dbc4b32d0b39701cbec65582e196a20889155fb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 16 Sep 2010 15:35:54 +0200
Subject: BKL: Remove BKL from squashfs

The BKL is only used in put_super and fill_super, which are both protected
by the superblocks s_umount rw_semaphore. Therefore it is safe to remove
the BKL entirely.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/super.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index ab1a401a0e19..07a4f1156048 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -87,14 +86,11 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	u64 lookup_table_start, xattr_id_table_start;
 	int err;
 
-	lock_kernel();
-
 	TRACE("Entered squashfs_fill_superblock\n");
 
 	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
 	if (sb->s_fs_info == NULL) {
 		ERROR("Failed to allocate squashfs_sb_info\n");
-		unlock_kernel();
 		return -ENOMEM;
 	}
 	msblk = sb->s_fs_info;
@@ -304,7 +300,6 @@ allocate_root:
 
 	TRACE("Leaving squashfs_fill_super\n");
 	kfree(sblk);
-	unlock_kernel();
 	return 0;
 
 failed_mount:
@@ -319,13 +314,11 @@ failed_mount:
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
-	unlock_kernel();
 	return err;
 
 failure:
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-	unlock_kernel();
 	return -ENOMEM;
 }
 
@@ -360,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -376,8 +367,6 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
-
-	unlock_kernel();
 }
 
 
-- 
cgit v1.2.3


From 60056794127a25d641465b706e8828186f7a2e1f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 24 Feb 2010 13:25:33 +0100
Subject: BKL: Remove BKL from OCFS2

The BKL in ocfs2/dlmfs is used in put_super, fill_super and remount_fs
that are all three protected by the superblocks s_umount rw_semaphore.

The use in ocfs2_control_open is evidently unrelated and the function
is protected by ocfs2_control_lock.

Therefore it is safe to remove the BKL entirely.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmfs/dlmfs.c |  9 +--------
 fs/ocfs2/stack_user.c  |  3 ---
 fs/ocfs2/super.c       | 12 ------------
 3 files changed, 1 insertion(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 667d7ceba8c9..c2903b84bb7a 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -44,7 +44,6 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/poll.h>
-#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 
@@ -589,27 +588,21 @@ static int dlmfs_fill_super(struct super_block * sb,
 	struct inode * inode;
 	struct dentry * root;
 
-	lock_kernel();
-
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = DLMFS_MAGIC;
 	sb->s_op = &dlmfs_ops;
 	inode = dlmfs_get_root_inode(sb);
-	if (!inode) {
-		unlock_kernel();
+	if (!inode)
 		return -ENOMEM;
-	}
 
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
-		unlock_kernel();
 		return -ENOMEM;
 	}
 	sb->s_root = root;
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..0e68f542ef2e 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
 
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	p->op_this_node = -1;
 
-	lock_kernel();
 	mutex_lock(&ocfs2_control_lock);
 	file->private_data = p;
 	list_add(&p->op_list, &ocfs2_control_private_list);
 	mutex_unlock(&ocfs2_control_lock);
-	unlock_kernel();
 
 	return 0;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b7e4f2d19d40..b7d724393b5a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -609,8 +609,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	lock_kernel();
-
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
 	    !ocfs2_check_set_options(sb, &parsed_options)) {
 		ret = -EINVAL;
@@ -717,7 +715,6 @@ unlock_osb:
 							MS_POSIXACL : 0);
 	}
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -1002,8 +999,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	char nodestr[8];
 	struct ocfs2_blockcheck_stats stats;
 
-	lock_kernel();
-
 	mlog_entry("%p, %p, %i", sb, data, silent);
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
@@ -1181,7 +1176,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 			atomic_set(&osb->vol_state, VOLUME_DISABLED);
 			wake_up(&osb->osb_mount_event);
 			mlog_exit(status);
-			unlock_kernel();
 			return status;
 		}
 	}
@@ -1196,7 +1190,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	ocfs2_orphan_scan_start(osb);
 
 	mlog_exit(status);
-	unlock_kernel();
 	return status;
 
 read_super_error:
@@ -1212,7 +1205,6 @@ read_super_error:
 	}
 
 	mlog_exit(status);
-	unlock_kernel();
 	return status;
 }
 
@@ -1645,13 +1637,9 @@ static void ocfs2_put_super(struct super_block *sb)
 {
 	mlog_entry("(0x%p)\n", sb);
 
-	lock_kernel();
-
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
 
-	unlock_kernel();
-
 	mlog_exit_void();
 }
 
-- 
cgit v1.2.3


From 2e54eb96e2c801f33d95b5dade15212ac4d6c4a5 Mon Sep 17 00:00:00 2001
From: Petr Vandrovec <petr@vandrovec.name>
Date: Mon, 27 Sep 2010 01:47:33 +0200
Subject: BKL: Remove BKL from ncpfs

Dozen of changes in ncpfs to provide some locking other than BKL.

In readdir cache unlock and mark complete first page as last operation,
so it can be used for synchronization, as code intended.

When updating dentry name on case insensitive filesystems do at least
some basic locking...

Hold i_mutex when updating inode fields.

Push some ncp_conn_is_valid down to ncp_request.  Connection can become
invalid at any moment, and fewer error code paths to test the better.

Use i_size_{read,write} to modify file size.

Set inode's backing_dev_info as ncpfs has its own special bdi.

In ioctl unbreak ioctls invoked on filesystem mounted 'ro' - tests are
for inode writeable or owner match, but were turned to filesystem
writeable and inode writeable or owner match.  Also collect all permission
checks in single place.

Add some locking, and remove comments saying that it would be cool to
add some locks to the code.

Constify some pointers.

Signed-off-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ncpfs/dir.c            | 221 +++++++++++++---------
 fs/ncpfs/file.c           |  25 +--
 fs/ncpfs/inode.c          |  41 ++--
 fs/ncpfs/ioctl.c          | 470 +++++++++++++++++++++++++---------------------
 fs/ncpfs/ncplib_kernel.c  | 101 +++++-----
 fs/ncpfs/ncplib_kernel.h  |  15 +-
 fs/ncpfs/ncpsign_kernel.c |  10 +-
 fs/ncpfs/sock.c           |   1 -
 include/linux/ncp_fs.h    |  28 ---
 include/linux/ncp_fs_sb.h |   4 +-
 10 files changed, 487 insertions(+), 429 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..aac8832e919e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
 };
 
 
+#define ncp_namespace(i)	(NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
+
+static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
+{
+#ifdef CONFIG_NCPFS_SMALLDOS
+	int ns = ncp_namespace(i);
+
+	if ((ns == NW_NS_DOS)
+#ifdef CONFIG_NCPFS_OS2_NS
+		|| ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
+#endif /* CONFIG_NCPFS_OS2_NS */
+	   )
+		return 0;
+#endif /* CONFIG_NCPFS_SMALLDOS */
+	return 1;
+}
+
+#define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
+
+static inline int ncp_case_sensitive(struct dentry *dentry)
+{
+#ifdef CONFIG_NCPFS_NFS_NS
+	return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+#else
+	return 0;
+#endif /* CONFIG_NCPFS_NFS_NS */
+}
+
 /*
  * Note: leave the hash unchanged if the directory
  * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
 static int 
 ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 {
-	struct nls_table *t;
-	unsigned long hash;
-	int i;
-
-	t = NCP_IO_TABLE(dentry);
+	if (!ncp_case_sensitive(dentry)) {
+		struct nls_table *t;
+		unsigned long hash;
+		int i;
 
-	if (!ncp_case_sensitive(dentry->d_inode)) {
+		t = NCP_IO_TABLE(dentry);
 		hash = init_name_hash();
 		for (i=0; i<this->len ; i++)
 			hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	if (a->len != b->len)
 		return 1;
 
-	if (ncp_case_sensitive(dentry->d_inode))
+	if (ncp_case_sensitive(dentry))
 		return strncmp(a->name, b->name, a->len);
 
 	return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
 
 
 static int
-__ncp_lookup_validate(struct dentry *dentry)
+ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ncp_server *server;
 	struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
 
 	server = NCP_SERVER(dir);
 
-	if (!ncp_conn_valid(server))
-		goto finished;
-
 	/*
 	 * Inspired by smbfs:
 	 * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
 	if (ncp_is_server_root(dir)) {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, 1);
-		if (!res)
+		if (!res) {
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+		}
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
 	 * what we remember, it's not valid any more.
 	 */
 	if (!res) {
-		if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) {
+		struct inode *inode = dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
 			ncp_new_dentry(dentry);
 			val=1;
 		} else
 			DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
 
-		ncp_update_inode2(dentry->d_inode, &finfo);
+		ncp_update_inode2(inode, &finfo);
+		mutex_unlock(&inode->i_mutex);
 	}
 
 finished:
@@ -335,16 +366,6 @@ finished:
 	return val;
 }
 
-static int
-ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-	int res;
-	lock_kernel();
-	res = __ncp_lookup_validate(dentry);
-	unlock_kernel();
-	return res;
-}
-
 static struct dentry *
 ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 {
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int result, mtime_valid = 0;
 	time_t mtime = 0;
 
-	lock_kernel();
-
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		(int) filp->f_pos);
 
 	result = -EIO;
+	/* Do not generate '.' and '..' when server is dead. */
 	if (!ncp_conn_valid(server))
 		goto out;
 
@@ -532,6 +552,12 @@ read_really:
 	ctl.head.end = ctl.fpos - 1;
 	ctl.head.eof = ctl.valid;
 finished:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		SetPageUptodate(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+	}
 	if (page) {
 		cache->head = ctl.head;
 		kunmap(page);
@@ -539,23 +565,17 @@ finished:
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	if (ctl.page) {
-		kunmap(ctl.page);
-		SetPageUptodate(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-	}
 out:
-	unlock_kernel();
 	return result;
 }
 
 static int
 ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry)
+		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
+		int inval_childs)
 {
 	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
+	struct inode *dir = dentry->d_inode;
 	struct ncp_cache_control ctl = *ctrl;
 	struct qstr qname;
 	int valid = 0;
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
 	qname.len = sizeof(__name);
-	if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len,
+	if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
 			entry->i.entryName, entry->i.nameLen,
-			!ncp_preserve_entry_case(inode, entry->i.NSCreator)))
+			!ncp_preserve_entry_case(dir, entry->i.NSCreator)))
 		return 1; /* I'm not sure */
 
 	qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 			goto end_advance;
 	} else {
 		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname.name,
-							newdent->d_name.len);
+
+		/* If case sensitivity changed for this volume, all entries below this one
+		   should be thrown away.  This entry itself is not affected, as its case
+		   sensitivity is controlled by its own parent. */
+		if (inval_childs)
+			shrink_dcache_parent(newdent);
+
+		/*
+		 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
+		 * case preserving yet case insensitive.  So we update dentry's name
+		 * as received from server.  We found dentry via d_lookup with our
+		 * hash, so we know that hash does not change, and so replacing name
+		 * should be reasonably safe.
+		 */
+		if (qname.len == newdent->d_name.len &&
+		    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
+			struct inode *inode = newdent->d_inode;
+
+			/*
+			 * Inside ncpfs all uses of d_name are either for debugging,
+			 * or on functions which acquire inode mutex (mknod, creat,
+			 * lookup).  So grab i_mutex here, to be sure.  d_path
+			 * uses dcache_lock when generating path, so we should too.
+			 * And finally d_compare is protected by dentry's d_lock, so
+			 * here we go.
+			 */
+			if (inode)
+				mutex_lock(&inode->i_mutex);
+			spin_lock(&dcache_lock);
+			spin_lock(&newdent->d_lock);
+			memcpy((char *) newdent->d_name.name, qname.name,
+								newdent->d_name.len);
+			spin_unlock(&newdent->d_lock);
+			spin_unlock(&dcache_lock);
+			if (inode)
+				mutex_unlock(&inode->i_mutex);
+		}
 	}
 
 	if (!newdent->d_inode) {
+		struct inode *inode;
+
 		entry->opened = 0;
-		entry->ino = iunique(inode->i_sb, 2);
-		newino = ncp_iget(inode->i_sb, entry);
-		if (newino) {
+		entry->ino = iunique(dir->i_sb, 2);
+		inode = ncp_iget(dir->i_sb, entry);
+		if (inode) {
 			newdent->d_op = &ncp_dentry_operations;
-			d_instantiate(newdent, newino);
+			d_instantiate(newdent, inode);
 			if (!hashed)
 				d_rehash(newdent);
 		}
-	} else
-		ncp_update_inode2(newdent->d_inode, entry);
+	} else {
+		struct inode *inode = newdent->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		ncp_update_inode2(inode, entry);
+		mutex_unlock(&inode->i_mutex);
+	}
 
 	if (newdent->d_inode) {
 		ino = newdent->d_inode->i_ino;
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 		ctl.cache = NULL;
 		ctl.idx  -= NCP_DIRCACHE_SIZE;
 		ctl.ofs  += 1;
-		ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
+		ctl.page  = grab_cache_page(&dir->i_data, ctl.ofs);
 		if (ctl.page)
 			ctl.cache = kmap(ctl.page);
 	}
@@ -633,7 +695,7 @@ end_advance:
 		if (!ino)
 			ino = find_inode_number(dentry, &qname);
 		if (!ino)
-			ino = iunique(inode->i_sb, 2);
+			ino = iunique(dir->i_sb, 2);
 		ctl.filled = filldir(dirent, qname.name, qname.len,
 				     filp->f_pos, ino, DT_UNKNOWN);
 		if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 			(unsigned long) filp->f_pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;
 
 		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
 			return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 				info.volume_name);
 			continue;
 		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
 			return;
 	}
 }
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
 		if (dent) {
 			struct inode* ino = dent->d_inode;
 			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
 				NCP_FINFO(ino)->volNumber = volNumber;
 				NCP_FINFO(ino)->dirEntNum = dirEntNum;
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
 			} else {
 				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
 			DPRINTK("ncpfs: sb->s_root == NULL!\n");
 		}
-	}
-	result = 0;
+	} else
+		result = 0;
 
 out:
 	return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	int error, res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	lock_kernel();
 	error = -EIO;
 	if (!ncp_conn_valid(server))
 		goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 				 dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
 
 finished:
 	PPRINTK("ncp_lookup: result=%d\n", error);
-	unlock_kernel();
 	return ERR_PTR(error);
 }
 
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name, mode);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 		if (result) {
 			if (result == 0x87)
 				error = -ENAMETOOLONG;
+			else if (result < 0)
+				error = result;
 			DPRINTK("ncp_create: %s/%s failed\n",
 				dentry->d_parent->d_name.name, dentry->d_name.name);
 			goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 
 	error = ncp_instantiate(dir, dentry, &finfo);
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("ncp_mkdir: making %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		goto out;
 
-	error = -EACCES;
-	if (ncp_open_create_file_or_subdir(server, dir, __name,
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
 					   OC_MODE_CREATE, aDIR,
 					   cpu_to_le16(0xffff),
-					   &finfo) == 0)
-	{
+					   &finfo);
+	if (error == 0) {
 		if (ncp_is_nfs_extras(server, finfo.volume)) {
 			mode |= S_IFDIR;
 			finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 				goto out;
 		}
 		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = result < 0 ? result : -EACCES;
 			break;
        	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	struct ncp_server *server;
 	int error;
 
-	lock_kernel();
 	server = NCP_SERVER(dir);
 	DPRINTK("ncp_unlink: unlinking %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 	
-	error = -EIO;
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	/*
 	 * Check whether to close the file ...
 	 */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = error < 0 ? error : -EACCES;
 			break;
 	}
-		
-out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
 
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = error < 0 ? error : -EACCES;
 			break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..6c754f70c529 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	DPRINTK("ncp_file_read: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
-
 	pos = *ppos;
 
 	if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	DPRINTK("ncp_file_write: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
 	if (file->f_flags & O_APPEND) {
-		pos = inode->i_size;
+		pos = i_size_read(inode);
 	}
 
 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	*ppos = pos;
 
-	if (pos > inode->i_size) {
-		inode->i_size = pos;
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
 	}
 	DPRINTK("ncp_file_write: exit %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }
 
-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
 const struct file_operations ncp_file_operations =
 {
-	.llseek 	= ncp_remote_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
 	.unlocked_ioctl	= ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..5f4e58d93fdd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 		inode->i_mode = nwi->nfs.mode;
 	}
 
-	inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
 
 	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
 	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
 		inode->i_mode = server->m.dir_mode;
 		/* for directories dataStreamSize seems to be some
 		   Object ID ??? */
-		inode->i_size = NCP_BLOCK_SIZE;
+		i_size_write(inode, NCP_BLOCK_SIZE);
 	} else {
+		u32 size;
+
 		inode->i_mode = server->m.file_mode;
-		inode->i_size = le32_to_cpu(nwi->dataStreamSize);
+		size = le32_to_cpu(nwi->dataStreamSize);
+		i_size_write(inode, size);
 #ifdef CONFIG_NCPFS_EXTRAS
 		if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 
 		 && (nwi->attributes & aSHARED)) {
 			switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
 				case aHIDDEN:
 					if (server->m.flags & NCP_MOUNT_SYMLINKS) {
-						if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE)
-						 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) {
+						if (/* (size >= NCP_MIN_SYMLINK_SIZE)
+						 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
 							inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
 							NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
 							break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
 }
 
 /*
- * Fill in the inode based on the ncp_entry_info structure.
+ * Fill in the inode based on the ncp_entry_info structure.  Used only for brand new inodes.
  */
 static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 {
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	if (inode) {
 		atomic_set(&NCP_FINFO(inode)->opened, info->opened);
 
+		inode->i_mapping->backing_dev_info = sb->s_bdi;
 		inode->i_ino = info->ino;
 		ncp_set_attr(inode, info);
 		if (S_ISREG(inode->i_mode)) {
@@ -565,10 +569,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 /*	server->conn_status = 0;	*/
 /*	server->root_dentry = NULL;	*/
 /*	server->root_setuped = 0;	*/
+	mutex_init(&server->root_setup_lock);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 /*	server->sign_wanted = 0;	*/
 /*	server->sign_active = 0;	*/
 #endif
+	init_rwsem(&server->auth_rwsem);
 	server->auth.auth_type = NCP_AUTH_NONE;
 /*	server->auth.object_name_len = 0;	*/
 /*	server->auth.object_name = NULL;	*/
@@ -593,7 +599,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server->nls_io = load_nls_default();
 #endif /* CONFIG_NCPFS_NLS */
 
-	server->dentry_ttl = 0;	/* no caching */
+	atomic_set(&server->dentry_ttl, 0);	/* no caching */
 
 	INIT_LIST_HEAD(&server->tx.requests);
 	mutex_init(&server->rcv.creq_mutex);
@@ -658,8 +664,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				goto out_disconnect;
 			}
 		}
+		ncp_lock_server(server);
 		if (options & 2)
 			server->sign_wanted = 1;
+		ncp_unlock_server(server);
 	}
 	else 
 #endif	/* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +728,9 @@ out_nls:
 	unload_nls(server->nls_io);
 	unload_nls(server->nls_vol);
 #endif
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -743,8 +754,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -756,6 +765,9 @@ static void ncp_put_super(struct super_block *sb)
 	unload_nls(server->nls_vol);
 	unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -771,8 +783,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +861,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	result = -EIO;
 
-	lock_kernel();	
-
 	server = NCP_SERVER(inode);
-	if ((!server) || !ncp_conn_valid(server))
+	if (!server)	/* How this could happen? */
 		goto out;
 
 	/* ageing the dentry to force validation */
@@ -981,8 +989,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
 				      inode, info_mask, &info);
 		if (result != 0) {
-			result = -EACCES;
-
 			if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
 				/* NetWare seems not to allow this. I
 				   do not know why. So, just tell the
@@ -1005,7 +1011,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 	mark_inode_dirty(inode);
 
 out:
-	unlock_kernel();
+	if (result > 0)
+		result = -EACCES;
 	return result;
 }
 
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..c2a1f9a155c3 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -35,16 +35,11 @@
 #define NCP_PACKET_SIZE_INTERNAL 65536
 
 static int
-ncp_get_fs_info(struct ncp_server * server, struct file *file,
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		struct ncp_fs_info __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info info;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;
 
@@ -65,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
 }
 
 static int
-ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info_v2 info2;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;
 
@@ -136,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
 #define NCP_IOC_SETPRIVATEDATA_32	_IOR('n', 10, struct compat_ncp_privatedata_ioctl)
 
 static int
-ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct compat_ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct compat_ncp_fs_info_v2 info2;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;
 
@@ -182,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	struct nls_table *iocharset;
 	struct nls_table *oldset_io;
 	struct nls_table *oldset_cp;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (server->root_setuped)
-		return -EBUSY;
+	int utf8;
+	int err;
 
 	if (copy_from_user(&user, arg, sizeof(user)))
 		return -EFAULT;
@@ -206,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	user.iocharset[NCP_IOCSNAME_LEN] = 0;
 	if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
 		iocharset = load_nls_default();
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	} else if (!strcmp(user.iocharset, "utf8")) {
 		iocharset = load_nls_default();
-		NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 1;
 	} else {
 		iocharset = load_nls(user.iocharset);
 		if (!iocharset) {
 			unload_nls(codepage);
 			return -EBADRQC;
 		}
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	}
 
-	oldset_cp = server->nls_vol;
-	server->nls_vol = codepage;
-	oldset_io = server->nls_io;
-	server->nls_io = iocharset;
-
+	mutex_lock(&server->root_setup_lock);
+	if (server->root_setuped) {
+		oldset_cp = codepage;
+		oldset_io = iocharset;
+		err = -EBUSY;
+	} else {
+		if (utf8)
+			NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		else
+			NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		oldset_cp = server->nls_vol;
+		server->nls_vol = codepage;
+		oldset_io = server->nls_io;
+		server->nls_io = iocharset;
+		err = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
 	unload_nls(oldset_cp);
 	unload_nls(oldset_io);
 
-	return 0;
+	return err;
 }
 
 static int
@@ -237,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	int len;
 
 	memset(&user, 0, sizeof(user));
+	mutex_lock(&server->root_setup_lock);
 	if (server->nls_vol && server->nls_vol->charset) {
 		len = strlen(server->nls_vol->charset);
 		if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 		strncpy(user.iocharset,	server->nls_io->charset, len);
 		user.iocharset[len] = 0;
 	}
+	mutex_unlock(&server->root_setup_lock);
 
 	if (copy_to_user(arg, &user, sizeof(user)))
 		return -EFAULT;
@@ -261,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
 	char* bouncebuffer;
 	void __user *argp = (void __user *)arg;
-	uid_t uid = current_uid();
 
 	switch (cmd) {
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_NCPREQUEST_32:
 #endif
 	case NCP_IOC_NCPREQUEST:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_NCPREQUEST_32) {
 			struct compat_ncp_ioctl_request request32;
@@ -314,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		server->current_size = request.size;
 		memcpy(server->packet, bouncebuffer, request.size);
 
-		result = ncp_request2(server, request.function, 
+		result = ncp_request2(server, request.function,
 			bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
 		if (result < 0)
 			result = -EIO;
@@ -331,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	case NCP_IOC_CONN_LOGGED_IN:
 
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
 		if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
 			return -EINVAL;
+		mutex_lock(&server->root_setup_lock);
 		if (server->root_setuped)
-			return -EBUSY;
-		server->root_setuped = 1;
-		return ncp_conn_logged_in(inode->i_sb);
+			result = -EBUSY;
+		else {
+			result = ncp_conn_logged_in(inode->i_sb);
+			if (result == 0)
+				server->root_setuped = 1;
+		}
+		mutex_unlock(&server->root_setup_lock);
+		return result;
 
 	case NCP_IOC_GET_FS_INFO:
-		return ncp_get_fs_info(server, filp, argp);
+		return ncp_get_fs_info(server, inode, argp);
 
 	case NCP_IOC_GET_FS_INFO_V2:
-		return ncp_get_fs_info_v2(server, filp, argp);
+		return ncp_get_fs_info_v2(server, inode, argp);
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GET_FS_INFO_V2_32:
-		return ncp_get_compat_fs_info_v2(server, filp, argp);
+		return ncp_get_compat_fs_info_v2(server, inode, argp);
 #endif
 	/* we have too many combinations of CONFIG_COMPAT,
 	 * CONFIG_64BIT and CONFIG_UID16, so just handle
 	 * any of the possible ioctls */
 	case NCP_IOC_GETMOUNTUID16:
-	case NCP_IOC_GETMOUNTUID32:
-	case NCP_IOC_GETMOUNTUID64:
-		if (file_permission(filp, MAY_READ) != 0
-			&& uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (cmd == NCP_IOC_GETMOUNTUID16) {
+		{
 			u16 uid;
+
 			SET_UID(uid, server->m.mounted_uid);
 			if (put_user(uid, (u16 __user *)argp))
 				return -EFAULT;
-		} else if (cmd == NCP_IOC_GETMOUNTUID32) {
-			if (put_user(server->m.mounted_uid,
-						(u32 __user *)argp))
-				return -EFAULT;
-		} else {
-			if (put_user(server->m.mounted_uid,
-						(u64 __user *)argp))
-				return -EFAULT;
+			return 0;
 		}
+	case NCP_IOC_GETMOUNTUID32:
+		if (put_user(server->m.mounted_uid,
+			     (u32 __user *)argp))
+			return -EFAULT;
+		return 0;
+	case NCP_IOC_GETMOUNTUID64:
+		if (put_user(server->m.mounted_uid,
+			     (u64 __user *)argp))
+			return -EFAULT;
 		return 0;
 
 	case NCP_IOC_GETROOT:
 		{
 			struct ncp_setroot_ioctl sr;
 
-			if (file_permission(filp, MAY_READ) != 0
-			    && uid != server->m.mounted_uid)
-				return -EACCES;
-
+			result = -EACCES;
+			mutex_lock(&server->root_setup_lock);
 			if (server->m.mounted_vol[0]) {
 				struct dentry* dentry = inode->i_sb->s_root;
 
 				if (dentry) {
 					struct inode* s_inode = dentry->d_inode;
-				
+
 					if (s_inode) {
 						sr.volNumber = NCP_FINFO(s_inode)->volNumber;
 						sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
 						sr.namespace = server->name_space[sr.volNumber];
+						result = 0;
 					} else
 						DPRINTK("ncpfs: s_root->d_inode==NULL\n");
 				} else
@@ -402,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 				sr.volNumber = -1;
 				sr.namespace = 0;
 				sr.dirEntNum = 0;
+				result = 0;
 			}
-			if (copy_to_user(argp, &sr, sizeof(sr)))
-				return -EFAULT;
-			return 0;
+			mutex_unlock(&server->root_setup_lock);
+			if (!result && copy_to_user(argp, &sr, sizeof(sr)))
+				result = -EFAULT;
+			return result;
 		}
 
 	case NCP_IOC_SETROOT:
@@ -416,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			__le32 dosde;
 			struct dentry* dentry;
 
-			if (!capable(CAP_SYS_ADMIN))
-			{
-				return -EACCES;
-			}
-			if (server->root_setuped) return -EBUSY;
 			if (copy_from_user(&sr, argp, sizeof(sr)))
 				return -EFAULT;
-			if (sr.volNumber < 0) {
-				server->m.mounted_vol[0] = 0;
-				vnum = NCP_NUMBER_OF_VOLUMES;
-				de = 0;
-				dosde = 0;
-			} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
-				return -EINVAL;
-			} else if (ncp_mount_subdir(server, sr.volNumber,
-						sr.namespace, sr.dirEntNum,
-						&vnum, &de, &dosde)) {
-				return -ENOENT;
-			}
-			
-			dentry = inode->i_sb->s_root;
-			server->root_setuped = 1;
-			if (dentry) {
-				struct inode* s_inode = dentry->d_inode;
-				
-				if (s_inode) {
-					NCP_FINFO(s_inode)->volNumber = vnum;
-					NCP_FINFO(s_inode)->dirEntNum = de;
-					NCP_FINFO(s_inode)->DosDirNum = dosde;
+			mutex_lock(&server->root_setup_lock);
+			if (server->root_setuped)
+				result = -EBUSY;
+			else {
+				if (sr.volNumber < 0) {
+					server->m.mounted_vol[0] = 0;
+					vnum = NCP_NUMBER_OF_VOLUMES;
+					de = 0;
+					dosde = 0;
+					result = 0;
+				} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+					result = -EINVAL;
+				} else if (ncp_mount_subdir(server, sr.volNumber,
+							sr.namespace, sr.dirEntNum,
+							&vnum, &de, &dosde)) {
+					result = -ENOENT;
 				} else
-					DPRINTK("ncpfs: s_root->d_inode==NULL\n");
-			} else
-				DPRINTK("ncpfs: s_root==NULL\n");
+					result = 0;
+
+				if (result == 0) {
+					dentry = inode->i_sb->s_root;
+					if (dentry) {
+						struct inode* s_inode = dentry->d_inode;
+
+						if (s_inode) {
+							NCP_FINFO(s_inode)->volNumber = vnum;
+							NCP_FINFO(s_inode)->dirEntNum = de;
+							NCP_FINFO(s_inode)->DosDirNum = dosde;
+							server->root_setuped = 1;
+						} else {
+							DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+							result = -EIO;
+						}
+					} else {
+						DPRINTK("ncpfs: s_root==NULL\n");
+						result = -EIO;
+					}
+				}
+				result = 0;
+			}
+			mutex_unlock(&server->root_setup_lock);
 
-			return 0;
+			return result;
 		}
 
-#ifdef CONFIG_NCPFS_PACKET_SIGNING	
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
 	case NCP_IOC_SIGN_INIT:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (argp) {
-			if (server->sign_wanted)
-			{
-				struct ncp_sign_init sign;
+		{
+			struct ncp_sign_init sign;
 
+			if (argp)
 				if (copy_from_user(&sign, argp, sizeof(sign)))
 					return -EFAULT;
-				memcpy(server->sign_root,sign.sign_root,8);
-				memcpy(server->sign_last,sign.sign_last,16);
-				server->sign_active = 1;
+			ncp_lock_server(server);
+			mutex_lock(&server->rcv.creq_mutex);
+			if (argp) {
+				if (server->sign_wanted) {
+					memcpy(server->sign_root,sign.sign_root,8);
+					memcpy(server->sign_last,sign.sign_last,16);
+					server->sign_active = 1;
+				}
+				/* ignore when signatures not wanted */
+			} else {
+				server->sign_active = 0;
 			}
-			/* ignore when signatures not wanted */
-		} else {
-			server->sign_active = 0;
+			mutex_unlock(&server->rcv.creq_mutex);
+			ncp_unlock_server(server);
+			return 0;
 		}
-		return 0;		
-		
+
         case NCP_IOC_SIGN_WANTED:
-		if (file_permission(filp, MAY_READ) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-		
-                if (put_user(server->sign_wanted, (int __user *)argp))
-			return -EFAULT;
-                return 0;
+		{
+			int state;
+
+			ncp_lock_server(server);
+			state = server->sign_wanted;
+			ncp_unlock_server(server);
+			if (put_user(state, (int __user *)argp))
+				return -EFAULT;
+			return 0;
+		}
 
 	case NCP_IOC_SET_SIGN_WANTED:
 		{
 			int newstate;
 
-			if (file_permission(filp, MAY_WRITE) != 0
-			    && uid != server->m.mounted_uid)
-				return -EACCES;
-
 			/* get only low 8 bits... */
 			if (get_user(newstate, (unsigned char __user *)argp))
 				return -EFAULT;
+			result = 0;
+			ncp_lock_server(server);
 			if (server->sign_active) {
 				/* cannot turn signatures OFF when active */
-				if (!newstate) return -EINVAL;
+				if (!newstate)
+					result = -EINVAL;
 			} else {
 				server->sign_wanted = newstate != 0;
 			}
-			return 0;
+			ncp_unlock_server(server);
+			return result;
 		}
 
 #endif /* CONFIG_NCPFS_PACKET_SIGNING */
 
 #ifdef CONFIG_NCPFS_IOCTL_LOCKING
 	case NCP_IOC_LOCKUNLOCK:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
 		{
 			struct ncp_lock_ioctl	 rqdata;
 
@@ -541,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			{
 				return result;
 			}
-			result = -EIO;
-			if (!ncp_conn_valid(server))
-				goto outrel;
 			result = -EISDIR;
 			if (!S_ISREG(inode->i_mode))
 				goto outrel;
 			if (rqdata.cmd == NCP_LOCK_CLEAR)
 			{
 				result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
-							NCP_FINFO(inode)->file_handle, 
+							NCP_FINFO(inode)->file_handle,
 							rqdata.offset,
 							rqdata.length);
 				if (result > 0) result = 0;	/* no such lock */
@@ -573,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 							rqdata.timeout);
 				if (result > 0) result = -EAGAIN;
 			}
-outrel:			
+outrel:
 			ncp_inode_close(inode);
 			return result;
 		}
@@ -581,60 +581,62 @@ outrel:
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GETOBJECTNAME_32:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct compat_ncp_objectname_ioctl user;
 			size_t outl;
 
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+			down_read(&server->auth_rwsem);
 			user.auth_type = server->auth.auth_type;
 			outl = user.object_name_len;
 			user.object_name_len = server->auth.object_name_len;
 			if (outl > user.object_name_len)
 				outl = user.object_name_len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(compat_ptr(user.object_name),
 						 server->auth.object_name,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
-			if (copy_to_user(argp, &user, sizeof(user)))
-				return -EFAULT;
-			return 0;
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
 		}
 #endif
 
 	case NCP_IOC_GETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_objectname_ioctl user;
 			size_t outl;
 
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+			down_read(&server->auth_rwsem);
 			user.auth_type = server->auth.auth_type;
 			outl = user.object_name_len;
 			user.object_name_len = server->auth.object_name_len;
 			if (outl > user.object_name_len)
 				outl = user.object_name_len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(user.object_name,
 						 server->auth.object_name,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
-			if (copy_to_user(argp, &user, sizeof(user)))
-				return -EFAULT;
-			return 0;
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
 		}
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_SETOBJECTNAME_32:
 #endif
 	case NCP_IOC_SETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_objectname_ioctl user;
 			void* newname;
@@ -666,9 +668,7 @@ outrel:
 			} else {
 				newname = NULL;
 			}
-			/* enter critical section */
-			/* maybe that kfree can sleep so do that this way */
-			/* it is at least more SMP friendly (in future...) */
+			down_write(&server->auth_rwsem);
 			oldname = server->auth.object_name;
 			oldnamelen = server->auth.object_name_len;
 			oldprivate = server->priv.data;
@@ -678,7 +678,7 @@ outrel:
 			server->auth.object_name = newname;
 			server->priv.len = 0;
 			server->priv.data = NULL;
-			/* leave critical section */
+			up_write(&server->auth_rwsem);
 			kfree(oldprivate);
 			kfree(oldname);
 			return 0;
@@ -688,8 +688,6 @@ outrel:
 	case NCP_IOC_GETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_GETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_privatedata_ioctl user;
 			size_t outl;
@@ -706,14 +704,20 @@ outrel:
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
 
+			down_read(&server->auth_rwsem);
 			outl = user.len;
 			user.len = server->priv.len;
 			if (outl > user.len) outl = user.len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(user.data,
 						 server->priv.data,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
+			up_read(&server->auth_rwsem);
+			if (result)
+				return result;
 #ifdef CONFIG_COMPAT
 			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
 				struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +737,6 @@ outrel:
 	case NCP_IOC_SETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_SETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_privatedata_ioctl user;
 			void* new;
@@ -762,12 +764,12 @@ outrel:
 			} else {
 				new = NULL;
 			}
-			/* enter critical section */
+			down_write(&server->auth_rwsem);
 			old = server->priv.data;
 			oldlen = server->priv.len;
 			server->priv.len = user.len;
 			server->priv.data = new;
-			/* leave critical section */
+			up_write(&server->auth_rwsem);
 			kfree(old);
 			return 0;
 		}
@@ -775,17 +777,13 @@ outrel:
 #ifdef CONFIG_NCPFS_NLS
 	case NCP_IOC_SETCHARSETS:
 		return ncp_set_charsets(server, argp);
-		
+
 	case NCP_IOC_GETCHARSETS:
 		return ncp_get_charsets(server, argp);
 
 #endif /* CONFIG_NCPFS_NLS */
 
 	case NCP_IOC_SETDENTRYTTL:
-		if (file_permission(filp, MAY_WRITE) != 0 &&
-		    uid != server->m.mounted_uid)
-			return -EACCES;
-
 		{
 			u_int32_t user;
 
@@ -795,13 +793,13 @@ outrel:
 			if (user > 20000)
 				return -EINVAL;
 			user = (user * HZ) / 1000;
-			server->dentry_ttl = user;
+			atomic_set(&server->dentry_ttl, user);
 			return 0;
 		}
-		
+
 	case NCP_IOC_GETDENTRYTTL:
 		{
-			u_int32_t user = (server->dentry_ttl * 1000) / HZ;
+			u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
 			if (copy_to_user(argp, &user, sizeof(user)))
 				return -EFAULT;
 			return 0;
@@ -811,59 +809,103 @@ outrel:
 	return -EINVAL;
 }
 
-static int ncp_ioctl_need_write(unsigned int cmd)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ncp_server *server = NCP_SERVER(inode);
+	uid_t uid = current_uid();
+	int need_drop_write = 0;
+	long ret;
+
 	switch (cmd) {
-	case NCP_IOC_GET_FS_INFO:
-	case NCP_IOC_GET_FS_INFO_V2:
-	case NCP_IOC_NCPREQUEST:
-	case NCP_IOC_SETDENTRYTTL:
-	case NCP_IOC_SIGN_INIT:
-	case NCP_IOC_LOCKUNLOCK:
-	case NCP_IOC_SET_SIGN_WANTED:
-		return 1;
-	case NCP_IOC_GETOBJECTNAME:
-	case NCP_IOC_SETOBJECTNAME:
-	case NCP_IOC_GETPRIVATEDATA:
-	case NCP_IOC_SETPRIVATEDATA:
 	case NCP_IOC_SETCHARSETS:
-	case NCP_IOC_GETCHARSETS:
 	case NCP_IOC_CONN_LOGGED_IN:
-	case NCP_IOC_GETDENTRYTTL:
-	case NCP_IOC_GETMOUNTUID2:
-	case NCP_IOC_SIGN_WANTED:
-	case NCP_IOC_GETROOT:
 	case NCP_IOC_SETROOT:
-		return 0;
-	default:
-		/* unknown IOCTL command, assume write */
-		return 1;
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EACCES;
+			goto out;
+		}
+		break;
 	}
-}
-
-long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	lock_kernel();
-	if (ncp_ioctl_need_write(cmd)) {
+	if (server->m.mounted_uid != uid) {
+		switch (cmd) {
 		/*
-		 * inside the ioctl(), any failures which
-		 * are because of file_permission() are
-		 * -EACCESS, so it seems consistent to keep
-		 *  that here.
+		 * Only mount owner can issue these ioctls.  Information
+		 * necessary to authenticate to other NDS servers are
+		 * stored here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt)) {
+		case NCP_IOC_GETOBJECTNAME:
+		case NCP_IOC_SETOBJECTNAME:
+		case NCP_IOC_GETPRIVATEDATA:
+		case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GETOBJECTNAME_32:
+		case NCP_IOC_SETOBJECTNAME_32:
+		case NCP_IOC_GETPRIVATEDATA_32:
+		case NCP_IOC_SETPRIVATEDATA_32:
+#endif
 			ret = -EACCES;
 			goto out;
+		/*
+		 * These require write access on the inode if user id
+		 * does not match.  Note that they do not write to the
+		 * file...  But old code did mnt_want_write, so I keep
+		 * it as is.  Of course not for mountpoint owner, as
+		 * that breaks read-only mounts altogether as ncpmount
+		 * needs working NCP_IOC_NCPREQUEST and
+		 * NCP_IOC_GET_FS_INFO.  Some of these codes (setdentryttl,
+		 * signinit, setsignwanted) should be probably restricted
+		 * to owner only, or even more to CAP_SYS_ADMIN).
+		 */
+		case NCP_IOC_GET_FS_INFO:
+		case NCP_IOC_GET_FS_INFO_V2:
+		case NCP_IOC_NCPREQUEST:
+		case NCP_IOC_SETDENTRYTTL:
+		case NCP_IOC_SIGN_INIT:
+		case NCP_IOC_LOCKUNLOCK:
+		case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GET_FS_INFO_V2_32:
+		case NCP_IOC_NCPREQUEST_32:
+#endif
+			ret = mnt_want_write_file(filp);
+			if (ret)
+				goto out;
+			need_drop_write = 1;
+			ret = inode_permission(inode, MAY_WRITE);
+			if (ret)
+				goto outDropWrite;
+			break;
+		/*
+		 * Read access required.
+		 */
+		case NCP_IOC_GETMOUNTUID16:
+		case NCP_IOC_GETMOUNTUID32:
+		case NCP_IOC_GETMOUNTUID64:
+		case NCP_IOC_GETROOT:
+		case NCP_IOC_SIGN_WANTED:
+			ret = inode_permission(inode, MAY_READ);
+			if (ret)
+				goto out;
+			break;
+		/*
+		 * Anybody can read these.
+		 */
+		case NCP_IOC_GETCHARSETS:
+		case NCP_IOC_GETDENTRYTTL:
+		default:
+		/* Three codes below are protected by CAP_SYS_ADMIN above. */
+		case NCP_IOC_SETCHARSETS:
+		case NCP_IOC_CONN_LOGGED_IN:
+		case NCP_IOC_SETROOT:
+			break;
 		}
 	}
-	ret = __ncp_ioctl(filp, cmd, arg);
-	if (ncp_ioctl_need_write(cmd))
+	ret = __ncp_ioctl(inode, cmd, arg);
+outDropWrite:
+	if (need_drop_write)
 		mnt_drop_write(filp->f_path.mnt);
-
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -872,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long ret;
 
-	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
 	ret = ncp_ioctl(file, cmd, arg);
-	unlock_kernel();
 	return ret;
 }
 #endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..a95615a0b6ac 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
 	return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
 }
 
-static inline u8 BVAL(void *data)
+static inline u8 BVAL(const void *data)
 {
-	return *(u8 *)data;
+	return *(const u8 *)data;
 }
 
 static u8 ncp_reply_byte(struct ncp_server *server, int offset)
 {
-	return *(u8 *)ncp_reply_data(server, offset);
+	return *(const u8 *)ncp_reply_data(server, offset);
 }
 
-static inline u16 WVAL_LH(void *data)
+static inline u16 WVAL_LH(const void *data)
 {
 	return get_unaligned_le16(data);
 }
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
 	return get_unaligned_be16(ncp_reply_data(server, offset));
 }
 
-static inline u32 DVAL_LH(void *data)
+static inline u32 DVAL_LH(const void *data)
 {
 	return get_unaligned_le32(data);
 }
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
 	return result;
 }
 
-void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
+void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
 {
-	__u8 *name_len;
+	const __u8 *name_len;
 	const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
 
 	memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
 }
 
 #ifdef CONFIG_NCPFS_NFS_NS
-static inline void ncp_extract_nfs_info(unsigned char *structure,
+static inline void ncp_extract_nfs_info(const unsigned char *structure,
 				 struct nw_nfs_info *target)
 {
 	target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
  * Returns information for a (one-component) name relative to
  * the specified directory.
  */
-int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path,
+int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
 			struct nw_info_struct *target)
 {
 	__u8  volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
 #ifdef CONFIG_NCPFS_NFS_NS
 static int
 ncp_obtain_DOS_dir_base(struct ncp_server *server,
-		__u8 volnum, __le32 dirent,
-		char *path, /* At most 1 component */
+		__u8 ns, __u8 volnum, __le32 dirent,
+		const char *path, /* At most 1 component */
 		__le32 *DOS_dir_base)
 {
 	int result;
 
 	ncp_init_request(server);
 	ncp_add_byte(server, 6); /* subfunction */
-	ncp_add_byte(server, server->name_space[volnum]);
-	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, ns);
+	ncp_add_byte(server, ns);
 	ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
 	ncp_add_dword(server, RIM_DIRECTORY);
 	ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
 #endif	/* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
 }
 
+int
+ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
+{
+	int ns = ncp_get_known_namespace(server, volume);
+
+	if (ret_ns)
+		*ret_ns = ns;
+
+	DPRINTK("lookup_vol: namespace[%d] = %d\n",
+		volume, server->name_space[volume]);
+
+	if (server->name_space[volume] == ns)
+		return 0;
+	server->name_space[volume] = ns;
+	return 1;
+}
+
 static int
 ncp_ObtainSpecificDirBase(struct ncp_server *server,
 		__u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
-		char *path, /* At most 1 component */
+		const char *path, /* At most 1 component */
 		__le32 *dirEntNum, __le32 *DosDirNum)
 {
 	int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
 {
 	int dstNS;
 	int result;
-	
-	dstNS = ncp_get_known_namespace(server, volNumber);
+
+	ncp_update_known_namespace(server, volNumber, &dstNS);
 	if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 
 				      dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
 	{
 		return result;
 	}
-	server->name_space[volNumber] = dstNS;
 	*volume = volNumber;
 	server->m.mounted_vol[1] = 0;
 	server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
 }
 
 int 
-ncp_get_volume_root(struct ncp_server *server, const char *volname,
-		    __u32* volume, __le32* dirent, __le32* dosdirent)
+ncp_get_volume_root(struct ncp_server *server,
+		    const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
 {
 	int result;
-	__u8 volnum;
 
 	DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
 
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
 		return result;
 	}
 	*dirent = *dosdirent = ncp_reply_dword(server, 4);
-	volnum = ncp_reply_byte(server, 8);
+	*volume = ncp_reply_byte(server, 8);
 	ncp_unlock_server(server);
-	*volume = volnum;
-
-	server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
-
-	DPRINTK("lookup_vol: namespace[%d] = %d\n",
-		volnum, server->name_space[volnum]);
-
 	return 0;
 }
 
 int
-ncp_lookup_volume(struct ncp_server *server, const char *volname,
-		  struct nw_info_struct *target)
+ncp_lookup_volume(struct ncp_server *server,
+		  const char *volname, struct nw_info_struct *target)
 {
 	int result;
 
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
 	if (result) {
 		return result;
 	}
+	ncp_update_known_namespace(server, target->volNumber, NULL);
 	target->nameLen = strlen(volname);
 	memcpy(target->entryName, volname, target->nameLen+1);
 	target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 {
 	int result = 0;
 
+	ncp_init_request(server);
 	if (server->name_space[volnum] == NW_NS_NFS) {
-		ncp_init_request(server);
 		ncp_add_byte(server, 25);	/* subfunction */
 		ncp_add_byte(server, server->name_space[volnum]);
 		ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 		ncp_add_dword_lh(server, 1);	/* nlinks */
 		ncp_add_dword_lh(server, rdev);
 		result = ncp_request(server, 87);
-		ncp_unlock_server(server);
 	}
+	ncp_unlock_server(server);
 	return result;
 }
 #endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 static int
 ncp_DeleteNSEntry(struct ncp_server *server,
 		  __u8 have_dir_base, __u8 volnum, __le32 dirent,
-		  char* name, __u8 ns, __le16 attr)
+		  const char* name, __u8 ns, __le16 attr)
 {
 	int result;
 
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
 
 int
 ncp_del_file_or_subdir(struct ncp_server *server,
-		       struct inode *dir, char *name)
+		       struct inode *dir, const char *name)
 {
 	__u8  volnum = NCP_FINFO(dir)->volNumber;
 	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int name_space;
 
+	name_space = server->name_space[volnum];
 #ifdef CONFIG_NCPFS_NFS_NS
-	if (server->name_space[volnum]==NW_NS_NFS)
+	if (name_space == NW_NS_NFS)
  	{
  		int result;
  
- 		result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent);
+		result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
  		if (result) return result;
- 		return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006));
+		name = NULL;
+		name_space = NW_NS_DOS;
  	}
- 	else
 #endif	/* CONFIG_NCPFS_NFS_NS */
- 		return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006));
+	return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
 }
 
 static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
 /* If both dir and name are NULL, then in target there's already a
    looked-up entry that wants to be opened. */
 int ncp_open_create_file_or_subdir(struct ncp_server *server,
-				   struct inode *dir, char *name,
+				   struct inode *dir, const char *name,
 				   int open_create_mode,
 				   __le32 create_attributes,
 				   __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
 
 static int
 ncp_RenameNSEntry(struct ncp_server *server,
-		  struct inode *old_dir, char *old_name, __le16 old_type,
-		  struct inode *new_dir, char *new_name)
+		  struct inode *old_dir, const char *old_name, __le16 old_type,
+		  struct inode *new_dir, const char *new_name)
 {
 	int result = -EINVAL;
 
@@ -929,8 +940,8 @@ out:
 }
 
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-				struct inode *old_dir, char *old_name,
-				struct inode *new_dir, char *new_name)
+				struct inode *old_dir, const char *old_name,
+				struct inode *new_dir, const char *new_name)
 {
         int result;
         __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
 ncp_read_kernel(struct ncp_server *server, const char *file_id,
 	     __u32 offset, __u16 to_read, char *target, int *bytes_read)
 {
-	char *source;
+	const char *source;
 	int result;
 
 	ncp_init_request(server);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..3c57eca634ce 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
 	atomic_dec(&NCP_FINFO(inode)->opened);
 }
 
-void ncp_extract_file_info(void* src, struct nw_info_struct* target);
-int ncp_obtain_info(struct ncp_server *server, struct inode *, char *,
+void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
+int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
 		struct nw_info_struct *target);
 int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
+int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
 int ncp_get_volume_root(struct ncp_server *server, const char *volname,
 			__u32 *volume, __le32 *dirent, __le32 *dosdirent);
 int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
 			__u32 mode, __u32 rdev);
 
 int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
-int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *);
-int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *,
+int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
+int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
 				int, __le32, __le16, struct ncp_entry_info *);
 
 int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
 			   char** rbuf, size_t* rsize);
 
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-			      struct inode *, char *, struct inode *, char *);
+			      struct inode *, const char *, struct inode *, const char *);
 
 
 int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
 #endif /* CONFIG_NCPFS_NLS */
 
 #define NCP_GET_AGE(dentry)	(jiffies - (dentry)->d_time)
-#define NCP_MAX_AGE(server)	((server)->dentry_ttl)
+#define NCP_MAX_AGE(server)	atomic_read(&(server)->dentry_ttl)
 #define NCP_TEST_AGE(server,dentry)	(NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
 
 static inline void
 ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
 {
-	dentry->d_time = jiffies - server->dentry_ttl;
+	dentry->d_time = jiffies - NCP_MAX_AGE(server);
 }
 
 static inline void
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..d8b2d7e6910b 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
 
 /* i386: 32-bit, little endian, handles mis-alignment */
 #ifdef __i386__
-#define GET_LE32(p) (*(int *)(p))
+#define GET_LE32(p) (*(const int *)(p))
 #define PUT_LE32(p,v) { *(int *)(p)=v; }
 #else
 /* from include/ncplib.h */
-#define BVAL(buf,pos) (((__u8 *)(buf))[pos])
+#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
 #define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
-#define BSET(buf,pos,val) (BVAL(buf,pos) = (val))
+#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
 
 static inline __u16
-WVAL_LH(__u8 * buf, int pos)
+WVAL_LH(const __u8 * buf, int pos)
 {
 	return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
 }
 static inline __u32
-DVAL_LH(__u8 * buf, int pos)
+DVAL_LH(const __u8 * buf, int pos)
 {
 	return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
 }
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..668bd267346e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
 		return -EIO;
 	}
 	if (!ncp_conn_valid(server)) {
-		printk(KERN_ERR "ncpfs: Connection invalid!\n");
 		return -EIO;
 	}
 	{
diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h
index 4522aed00906..ef663061d5ac 100644
--- a/include/linux/ncp_fs.h
+++ b/include/linux/ncp_fs.h
@@ -241,34 +241,6 @@ int ncp_mmap(struct file *, struct vm_area_struct *);
 /* linux/fs/ncpfs/ncplib_kernel.c */
 int ncp_make_closed(struct inode *);
 
-#define ncp_namespace(i)	(NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
-
-static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
-{
-#ifdef CONFIG_NCPFS_SMALLDOS
-	int ns = ncp_namespace(i);
-
-	if ((ns == NW_NS_DOS)
-#ifdef CONFIG_NCPFS_OS2_NS
-		|| ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
-#endif /* CONFIG_NCPFS_OS2_NS */
-				)
-		return 0;
-#endif /* CONFIG_NCPFS_SMALLDOS */
-	return 1;
-}
-
-#define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
-
-static inline int ncp_case_sensitive(struct inode *i)
-{
-#ifdef CONFIG_NCPFS_NFS_NS
-	return ncp_namespace(i) == NW_NS_NFS;
-#else
-	return 0;
-#endif	/* CONFIG_NCPFS_NFS_NS */
-} 
-
 #endif				/* __KERNEL__ */
 
 #endif				/* _LINUX_NCP_FS_H */
diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h
index 8da05bc098ca..d64b0e894336 100644
--- a/include/linux/ncp_fs_sb.h
+++ b/include/linux/ncp_fs_sb.h
@@ -62,6 +62,7 @@ struct ncp_server {
 	int ncp_reply_size;
 
 	int root_setuped;
+	struct mutex root_setup_lock;
 
 	/* info for packet signing */
 	int sign_wanted;	/* 1=Server needs signed packets */
@@ -81,13 +82,14 @@ struct ncp_server {
 		size_t	len;
 		void*	data;
 	} priv;
+	struct rw_semaphore auth_rwsem;
 
 	/* nls info: codepage for volume and charset for I/O */
 	struct nls_table *nls_vol;
 	struct nls_table *nls_io;
 
 	/* maximum age in jiffies */
-	int dentry_ttl;
+	atomic_t dentry_ttl;
 
 	/* miscellaneous */
 	unsigned int flags;
-- 
cgit v1.2.3


From b89f432133851a01c0d28822f11cbdcc15781a75 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 18 Sep 2010 15:09:31 +0200
Subject: fs/locks.c: prepare for BKL removal

This prepares the removal of the big kernel lock from the
file locking code. We still use the BKL as long as fs/lockd
uses it and ceph might sleep, but we can flip the definition
to a private spinlock as soon as that's done.
All users outside of fs/lockd get converted to use
lock_flocks() instead of lock_kernel() where appropriate.

Based on an earlier patch to use a spinlock from Matthew
Wilcox, who has attempted this a few times before, the
earliest patch from over 10 years ago turned it into
a semaphore, which ended up being slower than the BKL
and was subsequently reverted.

Someone should do some serious performance testing when
this becomes a spinlock, since this has caused problems
before. Using a spinlock should be at least as good
as the BKL in theory, but who knows...

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Sage Weil <sage@newdream.net>
Cc: linux-kernel@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 fs/afs/flock.c      |   5 +--
 fs/cifs/cifsfs.c    |   4 +-
 fs/gfs2/file.c      |   2 +
 fs/locks.c          | 112 +++++++++++++++++++++++++++++++---------------------
 fs/nfs/delegation.c |  10 ++---
 fs/nfs/nfs4state.c  |  10 ++---
 fs/nfsd/nfs4state.c |   6 +--
 include/linux/fs.h  |  14 +++++--
 8 files changed, 97 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/smp_lock.h>
 #include "internal.h"
 
 #define AFS_LOCK_GRANTED	0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_kernel();
+	lock_flocks();
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_kernel();
+	unlock_flocks();
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 4e273f7793f6..50208c15309a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -562,8 +562,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with the BKL held
-	   although I doubt that BKL is needed here in cifs */
+	/* note that this is called by vfs setlease with lock_flocks held
+	   to protect *lease from going away */
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	if (!(S_ISREG(inode->i_mode)))
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..8fcfefb96077 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -620,6 +620,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  * cluster; until we do, disable leases (by just returning -EINVAL),
  * unless the administrator has requested purely local locking.
  *
+ * Locking: called under lock_flocks
+ *
  * Returns: errno
  */
 
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..8b2b6ad56a09 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -143,6 +143,22 @@ int lease_break_time = 45;
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
 
+/*
+ * Protects the two list heads above, plus the inode->i_flock list
+ * FIXME: should use a spinlock, once lockd and ceph are ready.
+ */
+void lock_flocks(void)
+{
+	lock_kernel();
+}
+EXPORT_SYMBOL_GPL(lock_flocks);
+
+void unlock_flocks(void)
+{
+	unlock_kernel();
+}
+EXPORT_SYMBOL_GPL(unlock_flocks);
+
 static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
@@ -511,9 +527,9 @@ static void __locks_delete_block(struct file_lock *waiter)
  */
 static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_kernel();
+	lock_flocks();
 	__locks_delete_block(waiter);
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /* Insert waiter into blocker's block list.
@@ -644,7 +660,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 
-	lock_kernel();
+	lock_flocks();
 	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -657,7 +673,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_kernel();
+	unlock_flocks();
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +746,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	int error = 0;
 	int found = 0;
 
-	lock_kernel();
-	if (request->fl_flags & FL_ACCESS)
-		goto find_conflict;
-
-	if (request->fl_type != F_UNLCK) {
-		error = -ENOMEM;
+	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-		error = 0;
+		if (!new_fl)
+			return -ENOMEM;
 	}
 
+	lock_flocks();
+	if (request->fl_flags & FL_ACCESS)
+		goto find_conflict;
+
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
 		if (IS_POSIX(fl))
@@ -767,8 +781,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * If a higher-priority process was blocked on the old file lock,
 	 * give it the opportunity to lock the file.
 	 */
-	if (found)
+	if (found) {
+		unlock_flocks();
 		cond_resched();
+		lock_flocks();
+	}
 
 find_conflict:
 	for_each_lock(inode, before) {
@@ -794,7 +811,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -823,7 +840,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_kernel();
+	lock_flocks();
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -991,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_kernel();
+	unlock_flocks();
 	/*
 	 * Free any unused locks.
 	 */
@@ -1066,14 +1083,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1186,7 +1203,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
-	lock_kernel();
+	lock_flocks();
 
 	time_out_leases(inode);
 
@@ -1247,8 +1264,10 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
+	unlock_flocks();
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
+	lock_flocks();
 	__locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
@@ -1263,7 +1282,7 @@ restart:
 	}
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (!IS_ERR(new_fl))
 		locks_free_lock(new_fl);
 	return error;
@@ -1319,7 +1338,7 @@ int fcntl_getlease(struct file *filp)
 	struct file_lock *fl;
 	int type = F_UNLCK;
 
-	lock_kernel();
+	lock_flocks();
 	time_out_leases(filp->f_path.dentry->d_inode);
 	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1328,7 +1347,7 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return type;
 }
 
@@ -1341,7 +1360,7 @@ int fcntl_getlease(struct file *filp)
  *	The (input) flp->fl_lmops->fl_break function is required
  *	by break_lease().
  *
- *	Called with kernel lock held.
+ *	Called with file_lock_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1436,7 +1455,15 @@ out:
 }
 EXPORT_SYMBOL(generic_setlease);
 
- /**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+	if (filp->f_op && filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease);
+	else
+		return generic_setlease(filp, arg, lease);
+}
+
+/**
  *	vfs_setlease        -       sets a lease on an open file
  *	@filp: file pointer
  *	@arg: type of lease to obtain
@@ -1467,12 +1494,9 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
 	int error;
 
-	lock_kernel();
-	if (filp->f_op && filp->f_op->setlease)
-		error = filp->f_op->setlease(filp, arg, lease);
-	else
-		error = generic_setlease(filp, arg, lease);
-	unlock_kernel();
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, lease);
+	unlock_flocks();
 
 	return error;
 }
@@ -1499,9 +1523,9 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 	if (error)
 		return error;
 
-	lock_kernel();
+	lock_flocks();
 
-	error = vfs_setlease(filp, arg, &flp);
+	error = __vfs_setlease(filp, arg, &flp);
 	if (error || arg == F_UNLCK)
 		goto out_unlock;
 
@@ -1516,7 +1540,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 out_unlock:
-	unlock_kernel();
+	unlock_flocks();
 	return error;
 }
 
@@ -2020,7 +2044,7 @@ void locks_remove_flock(struct file *filp)
 			fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_kernel();
+	lock_flocks();
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2038,7 +2062,7 @@ void locks_remove_flock(struct file *filp)
  		}
 		before = &fl->fl_next;
 	}
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /**
@@ -2053,12 +2077,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -2172,7 +2196,7 @@ static int locks_show(struct seq_file *f, void *v)
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	lock_kernel();
+	lock_flocks();
 	f->private = (void *)1;
 	return seq_list_start(&file_lock_list, *pos);
 }
@@ -2184,7 +2208,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_kernel();
+	unlock_flocks();
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2231,7 +2255,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2272,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
@@ -2271,7 +2295,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2310,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..232a7eead33a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 	if (inode->i_flock == NULL)
 		goto out;
 
-	/* Protect inode->i_flock using the BKL */
-	lock_kernel();
+	/* Protect inode->i_flock using the file locks lock */
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
-		unlock_kernel();
+		unlock_flocks();
 		status = nfs4_lock_delegation_recall(state, fl);
 		if (status < 0)
 			goto out;
-		lock_kernel();
+		lock_flocks();
 	}
-	unlock_kernel();
+	unlock_flocks();
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..96524c5dca6b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,7 +40,7 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
@@ -970,13 +970,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
 	/* Protect inode->i_flock using the BKL */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		unlock_kernel();
+		unlock_flocks();
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 			case 0:
@@ -1003,9 +1003,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				status = 0;
 		}
-		lock_kernel();
+		lock_flocks();
 	}
-	unlock_kernel();
+	unlock_flocks();
 out:
 	up_write(&nfsi->rwsem);
 	return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..a7292fcf7718 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
 */
 
 #include <linux/file.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
@@ -3895,7 +3895,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
@@ -3903,7 +3903,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 		}
 	}
 out:
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b7..180325268237 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1093,10 +1093,6 @@ struct file_lock {
 
 #include <linux/fcntl.h>
 
-/* temporary stubs for BKL removal */
-#define lock_flocks() lock_kernel()
-#define unlock_flocks() unlock_kernel()
-
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
 #ifdef CONFIG_FILE_LOCKING
@@ -1135,6 +1131,8 @@ extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+extern void lock_flocks(void);
+extern void unlock_flocks(void);
 #else /* !CONFIG_FILE_LOCKING */
 static inline int fcntl_getlk(struct file *file, struct flock __user *user)
 {
@@ -1277,6 +1275,14 @@ static inline int lock_may_write(struct inode *inode, loff_t start,
 	return 1;
 }
 
+static inline void lock_flocks(void)
+{
+}
+
+static inline void unlock_flocks(void)
+{
+}
+
 #endif /* !CONFIG_FILE_LOCKING */
 
 
-- 
cgit v1.2.3


From 2a4df5d33202e99c015928bf2a2dfd8ad03e53bc Mon Sep 17 00:00:00 2001
From: Petr Vandrovec <petr@vandrovec.name>
Date: Wed, 29 Sep 2010 14:39:11 +0200
Subject: ncpfs: Lock socket in ncpfs while setting its callbacks

Otherwise partially updated pointers could be seen if
pointer update is not atomic.

Signed-off-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ncpfs/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 5f4e58d93fdd..985fabb26aca 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -303,10 +303,12 @@ ncp_evict_inode(struct inode *inode)
 
 static void ncp_stop_tasks(struct ncp_server *server) {
 	struct sock* sk = server->ncp_sock->sk;
-		
+
+	lock_sock(sk);
 	sk->sk_error_report = server->error_report;
 	sk->sk_data_ready   = server->data_ready;
 	sk->sk_write_space  = server->write_space;
+	release_sock(sk);
 	del_timer_sync(&server->timeout_tm);
 	flush_scheduled_work();
 }
@@ -605,10 +607,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	mutex_init(&server->rcv.creq_mutex);
 	server->tx.creq		= NULL;
 	server->rcv.creq	= NULL;
-	server->data_ready	= sock->sk->sk_data_ready;
-	server->write_space	= sock->sk->sk_write_space;
-	server->error_report	= sock->sk->sk_error_report;
-	sock->sk->sk_user_data	= server;
 
 	init_timer(&server->timeout_tm);
 #undef NCP_PACKET_SIZE
@@ -625,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (server->rxbuf == NULL)
 		goto out_txbuf;
 
+	lock_sock(sock->sk);
+	server->data_ready	= sock->sk->sk_data_ready;
+	server->write_space	= sock->sk->sk_write_space;
+	server->error_report	= sock->sk->sk_error_report;
+	sock->sk->sk_user_data	= server;
 	sock->sk->sk_data_ready	  = ncp_tcp_data_ready;
 	sock->sk->sk_error_report = ncp_tcp_error_report;
 	if (sock->type == SOCK_STREAM) {
@@ -640,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 		server->timeout_tm.data = (unsigned long)server;
 		server->timeout_tm.function = ncpdgram_timeout_call;
 	}
+	release_sock(sock->sk);
 
 	ncp_lock_server(server);
 	error = ncp_connect(server);
-- 
cgit v1.2.3


From 26daad8067f976a6e3622b42b0d80de9a53e8ad9 Mon Sep 17 00:00:00 2001
From: Márton Németh <nm127@freemail.hu>
Date: Fri, 24 Sep 2010 15:22:22 +0200
Subject: autofs: Only declare function when CONFIG_COMPAT is defined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The patch solves the following warnings message when CONFIG_COMPAT
is not defined:

fs/autofs/root.c:30: warning: ‘autofs_root_compat_ioctl’ declared ‘static’ but never defined

Signed-off-by: Márton Németh <nm127@freemail.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/autofs/root.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 11b1ea786d00..0c4ca81aeaeb 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -27,7 +27,9 @@ static int autofs_root_unlink(struct inode *,struct dentry *);
 static int autofs_root_rmdir(struct inode *,struct dentry *);
 static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
 static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 
 const struct file_operations autofs_root_operations = {
 	.llseek		= generic_file_llseek,
-- 
cgit v1.2.3


From 5a44a73b9015e89cfb280ba0534dfcbf1ef29d05 Mon Sep 17 00:00:00 2001
From: Felipe Contreras <felipe.contreras@gmail.com>
Date: Fri, 24 Sep 2010 15:22:23 +0200
Subject: autofs4: Only declare function when CONFIG_COMPAT is defined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The patch solves the following warnings message when CONFIG_COMPAT
is not defined:

fs/autofs4/root.c:31: warning: ‘autofs4_root_compat_ioctl’ declared ‘static’ but never defined

Signed-off-by: Felipe Contreras <felipe.contreras@gmail.com>
Cc: Ian Kent <raven@themaw.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/autofs4/root.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 9dd29c29fd12..d5c1401f0031 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -28,7 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
 static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
-- 
cgit v1.2.3


From 80f44b152c889e592616adf0d33b856107f4bace Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 17 Aug 2010 12:14:44 +0200
Subject: quota: Make QUOTACTL config be selected by its users

Remove "depends on" line from QUOTACTL config option and rather select
the option explicitely from config options which need it. It makes more
sense this way and also fixes Kconfig warning due to GFS2 selecting
QUOTACTL but QUOTACTL not depending on it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/Kconfig | 4 ++--
 fs/xfs/Kconfig   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 
 config QUOTA
 	bool "Quota support"
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set per user limits for disk
 	  usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 
 config QUOTACTL
 	bool
-	depends on XFS_QUOTA || QUOTA
-	default y
+	default n
 
 config QUOTACTL_COMPAT
 	bool
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
 	  a per user and/or a per group basis under XFS.  XFS considers quota
-- 
cgit v1.2.3


From beb37b85b0b727e68e16a39a1e5a2140f87fa201 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 26 Aug 2010 00:54:39 +0200
Subject: ext3: Fix lost extented attributes for inode with ino == 11

If a filesystem has inode size > 128 and someone deletes lost+found and
reuses inode 11 for some other file, extented attributes set for this
inode before umount will get lost after remounting the filesystem. This
is because extended attributes will get stored in an inode but ext3_iget
will ignore them due to workaround of a bug in an old mkfs.

Fix the problem by initializing i_extra_isize to 0 for freshly allocated
inodes where mkfs workaround in ext3_iget applies. This way these inodes
will always store extended attributes in a special block and no problems
occur.

The bug was spotted and a reproduction test provided by:
  Masayoshi MIZUMA <m.mizuma@jp.fujitsu.com>

Reviewed-by: Andreas Dilger <adilger.kernel@dilger.ca>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/ialloc.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
 	ei->i_state_flags = 0;
 	ext3_set_inode_state(inode, EXT3_STATE_NEW);
 
-	ei->i_extra_isize =
-		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
-		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+	/* See comment in ext3_iget for explanation */
+	if (ino >= EXT3_FIRST_INO(sb) + 1 &&
+	    EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+		ei->i_extra_isize =
+			sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+	} else {
+		ei->i_extra_isize = 0;
+	}
 
 	ret = inode;
 	dquot_initialize(inode);
-- 
cgit v1.2.3


From db7bee24d23d82cc55c7cbc9a1f82d07066d6fce Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 4 Oct 2010 22:28:10 +0200
Subject: autofs3: move to drivers/staging

Nobody appears to be interested in fixing autofs3 bugs
any more and it uses the BKL, which is going away.

Move this to staging for retirement. Unless someone
complains until 2.6.38, we can remove it for good.

The include/linux/auto_fs.h header file is still used
by autofs4, so it remains in place.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Ian Kent <raven@themaw.net>
Cc: autofs@linux.kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 MAINTAINERS                       |   4 +-
 drivers/staging/Kconfig           |   2 +
 drivers/staging/Makefile          |   1 +
 drivers/staging/autofs/Kconfig    |  21 ++
 drivers/staging/autofs/Makefile   |   7 +
 drivers/staging/autofs/autofs_i.h | 165 ++++++++++
 drivers/staging/autofs/dirhash.c  | 250 +++++++++++++++
 drivers/staging/autofs/init.c     |  52 +++
 drivers/staging/autofs/inode.c    | 288 +++++++++++++++++
 drivers/staging/autofs/root.c     | 643 ++++++++++++++++++++++++++++++++++++++
 drivers/staging/autofs/symlink.c  |  26 ++
 drivers/staging/autofs/waitq.c    | 205 ++++++++++++
 fs/Kconfig                        |   1 -
 fs/Makefile                       |   1 -
 fs/autofs/Kconfig                 |  21 --
 fs/autofs/Makefile                |   7 -
 fs/autofs/autofs_i.h              | 165 ----------
 fs/autofs/dirhash.c               | 250 ---------------
 fs/autofs/init.c                  |  52 ---
 fs/autofs/inode.c                 | 288 -----------------
 fs/autofs/root.c                  | 643 --------------------------------------
 fs/autofs/symlink.c               |  26 --
 fs/autofs/waitq.c                 | 205 ------------
 23 files changed, 1662 insertions(+), 1661 deletions(-)
 create mode 100644 drivers/staging/autofs/Kconfig
 create mode 100644 drivers/staging/autofs/Makefile
 create mode 100644 drivers/staging/autofs/autofs_i.h
 create mode 100644 drivers/staging/autofs/dirhash.c
 create mode 100644 drivers/staging/autofs/init.c
 create mode 100644 drivers/staging/autofs/inode.c
 create mode 100644 drivers/staging/autofs/root.c
 create mode 100644 drivers/staging/autofs/symlink.c
 create mode 100644 drivers/staging/autofs/waitq.c
 delete mode 100644 fs/autofs/Kconfig
 delete mode 100644 fs/autofs/Makefile
 delete mode 100644 fs/autofs/autofs_i.h
 delete mode 100644 fs/autofs/dirhash.c
 delete mode 100644 fs/autofs/init.c
 delete mode 100644 fs/autofs/inode.c
 delete mode 100644 fs/autofs/root.c
 delete mode 100644 fs/autofs/symlink.c
 delete mode 100644 fs/autofs/waitq.c

(limited to 'fs')

diff --git a/MAINTAINERS b/MAINTAINERS
index a3f9786cc6fe..0d675a94d630 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3314,8 +3314,8 @@ F:	Documentation/kdump/
 KERNEL AUTOMOUNTER (AUTOFS)
 M:	"H. Peter Anvin" <hpa@zytor.com>
 L:	autofs@linux.kernel.org
-S:	Odd Fixes
-F:	fs/autofs/
+S:	Obsolete
+F:	drivers/staging/autofs/
 
 KERNEL AUTOMOUNTER v4 (AUTOFS4)
 M:	Ian Kent <raven@themaw.net>
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index e8edba7b5623..01503536e457 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -87,6 +87,8 @@ source "drivers/staging/dream/Kconfig"
 
 source "drivers/staging/pohmelfs/Kconfig"
 
+source "drivers/staging/autofs/Kconfig"
+
 source "drivers/staging/phison/Kconfig"
 
 source "drivers/staging/line6/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index d8eeea4e63ae..de5c0e5a99ed 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SPECTRA)		+= spectra/
 obj-$(CONFIG_TRANZPORT)		+= frontier/
 obj-$(CONFIG_DREAM)		+= dream/
 obj-$(CONFIG_POHMELFS)		+= pohmelfs/
+obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_IDE_PHISON)	+= phison/
 obj-$(CONFIG_LINE6_USB)		+= line6/
 obj-$(CONFIG_USB_SERIAL_QUATECH2)	+= serqt_usb2/
diff --git a/drivers/staging/autofs/Kconfig b/drivers/staging/autofs/Kconfig
new file mode 100644
index 000000000000..5f3bea90911e
--- /dev/null
+++ b/drivers/staging/autofs/Kconfig
@@ -0,0 +1,21 @@
+config AUTOFS_FS
+	tristate "Kernel automounter support"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from the autofs
+	  package; you can find the location in <file:Documentation/Changes>.
+	  You also want to answer Y to "NFS file system support", below.
+
+	  If you want to use the newer version of the automounter with more
+	  features, say N here and say Y to "Kernel automounter v4 support",
+	  below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs.
+
+	  If you are not a part of a fairly large, distributed network, you
+	  probably do not need an automounter, and can say N here.
diff --git a/drivers/staging/autofs/Makefile b/drivers/staging/autofs/Makefile
new file mode 100644
index 000000000000..453a60f46d05
--- /dev/null
+++ b/drivers/staging/autofs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux autofs-filesystem routines.
+#
+
+obj-$(CONFIG_AUTOFS_FS) += autofs.o
+
+autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/drivers/staging/autofs/autofs_i.h b/drivers/staging/autofs/autofs_i.h
new file mode 100644
index 000000000000..647a14356e39
--- /dev/null
+++ b/drivers/staging/autofs/autofs_i.h
@@ -0,0 +1,165 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *   
+ * drivers/staging/autofs/autofs_i.h
+ *
+ *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/* Internal header file for autofs */
+
+#include <linux/auto_fs.h>
+
+/* This is the range of ioctl() numbers we claim as ours */
+#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
+#define AUTOFS_IOC_COUNT     32
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/wait.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+
+#include <asm/current.h>
+#include <asm/uaccess.h>
+
+#ifdef DEBUG
+#define DPRINTK(D) (printk D)
+#else
+#define DPRINTK(D) ((void)0)
+#endif
+
+/*
+ * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
+ * kernel will keep the negative response cached for up to the time given
+ * here, although the time can be shorter if the kernel throws the dcache
+ * entry away.  This probably should be settable from user space.
+ */
+#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ)	/* 1 minute */
+
+/* Structures associated with the root directory hash table */
+
+#define AUTOFS_HASH_SIZE 67
+
+struct autofs_dir_ent {
+	int hash;
+	char *name;
+	int len;
+	ino_t ino;
+	struct dentry *dentry;
+	/* Linked list of entries */
+	struct autofs_dir_ent *next;
+	struct autofs_dir_ent **back;
+	/* The following entries are for the expiry system */
+	unsigned long last_usage;
+	struct list_head exp;
+};
+
+struct autofs_dirhash {
+	struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
+	struct list_head expiry_head;
+};
+
+struct autofs_wait_queue {
+	wait_queue_head_t queue;
+	struct autofs_wait_queue *next;
+	autofs_wqt_t wait_queue_token;
+	/* We use the following to see what we are waiting for */
+	int hash;
+	int len;
+	char *name;
+	/* This is for status reporting upon return */
+	int status;
+	int wait_ctr;
+};
+
+struct autofs_symlink {
+	char *data;
+	int len;
+	time_t mtime;
+};
+
+#define AUTOFS_MAX_SYMLINKS 256
+
+#define AUTOFS_ROOT_INO      1
+#define AUTOFS_FIRST_SYMLINK 2
+#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
+
+#define AUTOFS_SYMLINK_BITMAP_LEN \
+	((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
+
+#define AUTOFS_SBI_MAGIC 0x6d4a556d
+
+struct autofs_sb_info {
+	u32 magic;
+	struct file *pipe;
+	struct pid *oz_pgrp;
+	int catatonic;
+	struct super_block *sb;
+	unsigned long exp_timeout;
+	ino_t next_dir_ino;
+	struct autofs_wait_queue *queues; /* Wait queue pointer */
+	struct autofs_dirhash dirhash; /* Root directory hash */
+	struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
+	unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
+};
+
+static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
+{
+	return (struct autofs_sb_info *)(sb->s_fs_info);
+}
+
+/* autofs_oz_mode(): do we see the man behind the curtain?  (The
+   processes which do manipulations for us in user space sees the raw
+   filesystem without "magic".) */
+
+static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
+}
+
+/* Hash operations */
+
+void autofs_initialize_hash(struct autofs_dirhash *);
+struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
+void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
+void autofs_hash_delete(struct autofs_dir_ent *);
+struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
+void autofs_hash_dputall(struct autofs_dirhash *);
+void autofs_hash_nuke(struct autofs_sb_info *);
+
+/* Expiration-handling functions */
+
+void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
+struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
+
+/* Operations structures */
+
+extern const struct inode_operations autofs_root_inode_operations;
+extern const struct inode_operations autofs_symlink_inode_operations;
+extern const struct file_operations autofs_root_operations;
+
+/* Initializing function */
+
+int autofs_fill_super(struct super_block *, void *, int);
+void autofs_kill_sb(struct super_block *sb);
+struct inode *autofs_iget(struct super_block *, unsigned long);
+
+/* Queue management functions */
+
+int autofs_wait(struct autofs_sb_info *,struct qstr *);
+int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+void autofs_catatonic_mode(struct autofs_sb_info *);
+
+#ifdef DEBUG
+void autofs_say(const char *name, int len);
+#else
+#define autofs_say(n,l) ((void)0)
+#endif
diff --git a/drivers/staging/autofs/dirhash.c b/drivers/staging/autofs/dirhash.c
new file mode 100644
index 000000000000..8f3e2b816129
--- /dev/null
+++ b/drivers/staging/autofs/dirhash.c
@@ -0,0 +1,250 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/dirhash.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include "autofs_i.h"
+
+/* Functions for maintenance of expiry queue */
+
+static void autofs_init_usage(struct autofs_dirhash *dh,
+			      struct autofs_dir_ent *ent)
+{
+	list_add_tail(&ent->exp, &dh->expiry_head);
+	ent->last_usage = jiffies;
+}
+
+static void autofs_delete_usage(struct autofs_dir_ent *ent)
+{
+	list_del(&ent->exp);
+}
+
+void autofs_update_usage(struct autofs_dirhash *dh,
+			 struct autofs_dir_ent *ent)
+{
+	autofs_delete_usage(ent);   /* Unlink from current position */
+	autofs_init_usage(dh,ent);  /* Relink at queue tail */
+}
+
+struct autofs_dir_ent *autofs_expire(struct super_block *sb,
+				     struct autofs_sb_info *sbi,
+				     struct vfsmount *mnt)
+{
+	struct autofs_dirhash *dh = &sbi->dirhash;
+	struct autofs_dir_ent *ent;
+	unsigned long timeout = sbi->exp_timeout;
+
+	while (1) {
+		struct path path;
+		int umount_ok;
+
+		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
+			return NULL;	/* No entries */
+		/* We keep the list sorted by last_usage and want old stuff */
+		ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
+		if (jiffies - ent->last_usage < timeout)
+			break;
+		/* Move to end of list in case expiry isn't desirable */
+		autofs_update_usage(dh, ent);
+
+		/* Check to see that entry is expirable */
+		if ( ent->ino < AUTOFS_FIRST_DIR_INO )
+			return ent; /* Symlinks are always expirable */
+
+		/* Get the dentry for the autofs subdirectory */
+		path.dentry = ent->dentry;
+
+		if (!path.dentry) {
+			/* Should only happen in catatonic mode */
+			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
+			autofs_delete_usage(ent);
+			continue;
+		}
+
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
+			printk("autofs: negative dentry on expiry queue: %s\n",
+			       ent->name);
+			autofs_delete_usage(ent);
+			continue;
+		}
+
+		/* Make sure entry is mounted and unused; note that dentry will
+		   point to the mounted-on-top root. */
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
+			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
+			continue;
+		}
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path)) {
+			path_put(&path);
+			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
+			continue;
+		}
+		while (d_mountpoint(path.dentry) && follow_down(&path))
+			;
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
+
+		if (umount_ok) {
+			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
+			return ent; /* Expirable! */
+		}
+		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
+	}
+	return NULL;		/* No expirable entries */
+}
+
+void autofs_initialize_hash(struct autofs_dirhash *dh) {
+	memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
+	INIT_LIST_HEAD(&dh->expiry_head);
+}
+
+struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
+{
+	struct autofs_dir_ent *dhn;
+
+	DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
+	autofs_say(name->name,name->len);
+
+	for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
+		if ( name->hash == dhn->hash &&
+		     name->len == dhn->len &&
+		     !memcmp(name->name, dhn->name, name->len) )
+			break;
+	}
+
+	return dhn;
+}
+
+void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
+{
+	struct autofs_dir_ent **dhnp;
+
+	DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
+	autofs_say(ent->name,ent->len);
+
+	autofs_init_usage(dh,ent);
+	if (ent->dentry)
+		dget(ent->dentry);
+
+	dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
+	ent->next = *dhnp;
+	ent->back = dhnp;
+	*dhnp = ent;
+	if ( ent->next )
+		ent->next->back = &(ent->next);
+}
+
+void autofs_hash_delete(struct autofs_dir_ent *ent)
+{
+	*(ent->back) = ent->next;
+	if ( ent->next )
+		ent->next->back = ent->back;
+
+	autofs_delete_usage(ent);
+
+	if ( ent->dentry )
+		dput(ent->dentry);
+	kfree(ent->name);
+	kfree(ent);
+}
+
+/*
+ * Used by readdir().  We must validate "ptr", so we can't simply make it
+ * a pointer.  Values below 0xffff are reserved; calling with any value
+ * <= 0x10000 will return the first entry found.
+ *
+ * "last" can be NULL or the value returned by the last search *if* we
+ * want the next sequential entry.
+ */
+struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
+					off_t *ptr, struct autofs_dir_ent *last)
+{
+	int bucket, ecount, i;
+	struct autofs_dir_ent *ent;
+
+	bucket = (*ptr >> 16) - 1;
+	ecount = *ptr & 0xffff;
+
+	if ( bucket < 0 ) {
+		bucket = ecount = 0;
+	} 
+
+	DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
+
+	ent = last ? last->next : NULL;
+
+	if ( ent ) {
+		ecount++;
+	} else {
+		while  ( bucket < AUTOFS_HASH_SIZE ) {
+			ent = dh->h[bucket];
+			for ( i = ecount ; ent && i ; i-- )
+				ent = ent->next;
+			
+			if (ent) {
+				ecount++; /* Point to *next* entry */
+				break;
+			}
+			
+			bucket++; ecount = 0;
+		}
+	}
+
+#ifdef DEBUG
+	if ( !ent )
+		printk("autofs_hash_enum: nothing found\n");
+	else {
+		printk("autofs_hash_enum: found hash %08x, name", ent->hash);
+		autofs_say(ent->name,ent->len);
+	}
+#endif
+
+	*ptr = ((bucket+1) << 16) + ecount;
+	return ent;
+}
+
+/* Iterate over all the ents, and remove all dentry pointers.  Used on
+   entering catatonic mode, in order to make the filesystem unmountable. */
+void autofs_hash_dputall(struct autofs_dirhash *dh)
+{
+	int i;
+	struct autofs_dir_ent *ent;
+
+	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
+		for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
+			if ( ent->dentry ) {
+				dput(ent->dentry);
+				ent->dentry = NULL;
+			}
+		}
+	}
+}
+
+/* Delete everything.  This is used on filesystem destruction, so we
+   make no attempt to keep the pointers valid */
+void autofs_hash_nuke(struct autofs_sb_info *sbi)
+{
+	int i;
+	struct autofs_dir_ent *ent, *nent;
+
+	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
+		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
+			nent = ent->next;
+			if ( ent->dentry )
+				dput(ent->dentry);
+			kfree(ent->name);
+			kfree(ent);
+		}
+	}
+}
diff --git a/drivers/staging/autofs/init.c b/drivers/staging/autofs/init.c
new file mode 100644
index 000000000000..765c72f42976
--- /dev/null
+++ b/drivers/staging/autofs/init.c
@@ -0,0 +1,52 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/init.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include "autofs_i.h"
+
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
+}
+
+static struct file_system_type autofs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "autofs",
+	.get_sb		= autofs_get_sb,
+	.kill_sb	= autofs_kill_sb,
+};
+
+static int __init init_autofs_fs(void)
+{
+	return register_filesystem(&autofs_fs_type);
+}
+
+static void __exit exit_autofs_fs(void)
+{
+	unregister_filesystem(&autofs_fs_type);
+}
+
+module_init(init_autofs_fs);
+module_exit(exit_autofs_fs);
+
+#ifdef DEBUG
+void autofs_say(const char *name, int len)
+{
+	printk("(%d: ", len);
+	while ( len-- )
+		printk("%c", *name++);
+	printk(")\n");
+}
+#endif
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/autofs/inode.c b/drivers/staging/autofs/inode.c
new file mode 100644
index 000000000000..74db190ae845
--- /dev/null
+++ b/drivers/staging/autofs/inode.c
@@ -0,0 +1,288 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/inode.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/bitops.h>
+#include <linux/magic.h>
+#include "autofs_i.h"
+#include <linux/module.h>
+
+void autofs_kill_sb(struct super_block *sb)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(sb);
+	unsigned int n;
+
+	/*
+	 * In the event of a failure in get_sb_nodev the superblock
+	 * info is not present so nothing else has been setup, so
+	 * just call kill_anon_super when we are called from
+	 * deactivate_super.
+	 */
+	if (!sbi)
+		goto out_kill_sb;
+
+	if (!sbi->catatonic)
+		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
+
+	put_pid(sbi->oz_pgrp);
+
+	autofs_hash_nuke(sbi);
+	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
+		if (test_bit(n, sbi->symlink_bitmap))
+			kfree(sbi->symlink[n].data);
+	}
+
+	kfree(sb->s_fs_info);
+
+out_kill_sb:
+	DPRINTK(("autofs: shutting down\n"));
+	kill_anon_super(sb);
+}
+
+static const struct super_operations autofs_sops = {
+	.statfs		= simple_statfs,
+	.show_options	= generic_show_options,
+};
+
+enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
+
+static const match_table_t autofs_tokens = {
+	{Opt_fd, "fd=%u"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_pgrp, "pgrp=%u"},
+	{Opt_minproto, "minproto=%u"},
+	{Opt_maxproto, "maxproto=%u"},
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+		pid_t *pgrp, int *minproto, int *maxproto)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	*uid = current_uid();
+	*gid = current_gid();
+	*pgrp = task_pgrp_nr(current);
+
+	*minproto = *maxproto = AUTOFS_PROTO_VERSION;
+
+	*pipefd = -1;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, autofs_tokens, args);
+		switch (token) {
+		case Opt_fd:
+			if (match_int(&args[0], &option))
+				return 1;
+			*pipefd = option;
+			break;
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 1;
+			*uid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 1;
+			*gid = option;
+			break;
+		case Opt_pgrp:
+			if (match_int(&args[0], &option))
+				return 1;
+			*pgrp = option;
+			break;
+		case Opt_minproto:
+			if (match_int(&args[0], &option))
+				return 1;
+			*minproto = option;
+			break;
+		case Opt_maxproto:
+			if (match_int(&args[0], &option))
+				return 1;
+			*maxproto = option;
+			break;
+		default:
+			return 1;
+		}
+	}
+	return (*pipefd < 0);
+}
+
+int autofs_fill_super(struct super_block *s, void *data, int silent)
+{
+	struct inode * root_inode;
+	struct dentry * root;
+	struct file * pipe;
+	int pipefd;
+	struct autofs_sb_info *sbi;
+	int minproto, maxproto;
+	pid_t pgid;
+
+	save_mount_options(s, data);
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		goto fail_unlock;
+	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
+
+	s->s_fs_info = sbi;
+	sbi->magic = AUTOFS_SBI_MAGIC;
+	sbi->pipe = NULL;
+	sbi->catatonic = 1;
+	sbi->exp_timeout = 0;
+	autofs_initialize_hash(&sbi->dirhash);
+	sbi->queues = NULL;
+	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
+	sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = AUTOFS_SUPER_MAGIC;
+	s->s_op = &autofs_sops;
+	s->s_time_gran = 1;
+	sbi->sb = s;
+
+	root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
+	if (IS_ERR(root_inode))
+		goto fail_free;
+	root = d_alloc_root(root_inode);
+	pipe = NULL;
+
+	if (!root)
+		goto fail_iput;
+
+	/* Can this call block?  - WTF cares? s is locked. */
+	if (parse_options(data, &pipefd, &root_inode->i_uid,
+				&root_inode->i_gid, &pgid, &minproto,
+				&maxproto)) {
+		printk("autofs: called with bogus options\n");
+		goto fail_dput;
+	}
+
+	/* Couldn't this be tested earlier? */
+	if (minproto > AUTOFS_PROTO_VERSION ||
+	     maxproto < AUTOFS_PROTO_VERSION) {
+		printk("autofs: kernel does not match daemon version\n");
+		goto fail_dput;
+	}
+
+	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
+	sbi->oz_pgrp = find_get_pid(pgid);
+
+	if (!sbi->oz_pgrp) {
+		printk("autofs: could not find process group %d\n", pgid);
+		goto fail_dput;
+	}
+
+	pipe = fget(pipefd);
+	
+	if (!pipe) {
+		printk("autofs: could not open pipe file descriptor\n");
+		goto fail_put_pid;
+	}
+
+	if (!pipe->f_op || !pipe->f_op->write)
+		goto fail_fput;
+	sbi->pipe = pipe;
+	sbi->catatonic = 0;
+
+	/*
+	 * Success! Install the root dentry now to indicate completion.
+	 */
+	s->s_root = root;
+	return 0;
+
+fail_fput:
+	printk("autofs: pipe file descriptor does not contain proper ops\n");
+	fput(pipe);
+fail_put_pid:
+	put_pid(sbi->oz_pgrp);
+fail_dput:
+	dput(root);
+	goto fail_free;
+fail_iput:
+	printk("autofs: get root dentry failed\n");
+	iput(root_inode);
+fail_free:
+	kfree(sbi);
+	s->s_fs_info = NULL;
+fail_unlock:
+	return -EINVAL;
+}
+
+struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
+{
+	unsigned int n;
+	struct autofs_sb_info *sbi = autofs_sbi(sb);
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	/* Initialize to the default case (stub directory) */
+
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	if (ino == AUTOFS_ROOT_INO) {
+		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
+		inode->i_op = &autofs_root_inode_operations;
+		inode->i_fop = &autofs_root_operations;
+		goto done;
+	} 
+	
+	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
+	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
+	
+	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
+		/* Symlink inode - should be in symlink list */
+		struct autofs_symlink *sl;
+
+		n = ino - AUTOFS_FIRST_SYMLINK;
+		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
+			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
+			goto done;
+		}
+		
+		inode->i_op = &autofs_symlink_inode_operations;
+		sl = &sbi->symlink[n];
+		inode->i_private = sl;
+		inode->i_mode = S_IFLNK | S_IRWXUGO;
+		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
+		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+		inode->i_size = sl->len;
+		inode->i_nlink = 1;
+	}
+
+done:
+	unlock_new_inode(inode);
+	return inode;
+}
diff --git a/drivers/staging/autofs/root.c b/drivers/staging/autofs/root.c
new file mode 100644
index 000000000000..70210aa0692a
--- /dev/null
+++ b/drivers/staging/autofs/root.c
@@ -0,0 +1,643 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/root.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
+#include "autofs_i.h"
+
+static int autofs_root_readdir(struct file *,void *,filldir_t);
+static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
+static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
+static int autofs_root_unlink(struct inode *,struct dentry *);
+static int autofs_root_rmdir(struct inode *,struct dentry *);
+static int autofs_root_mkdir(struct inode *,struct dentry *,int);
+static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+
+const struct file_operations autofs_root_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= autofs_root_readdir,
+	.unlocked_ioctl	= autofs_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs_root_compat_ioctl,
+#endif
+};
+
+const struct inode_operations autofs_root_inode_operations = {
+        .lookup		= autofs_root_lookup,
+        .unlink		= autofs_root_unlink,
+        .symlink	= autofs_root_symlink,
+        .mkdir		= autofs_root_mkdir,
+        .rmdir		= autofs_root_rmdir,
+};
+
+static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct autofs_dir_ent *ent = NULL;
+	struct autofs_dirhash *dirhash;
+	struct autofs_sb_info *sbi;
+	struct inode * inode = filp->f_path.dentry->d_inode;
+	off_t onr, nr;
+
+	lock_kernel();
+
+	sbi = autofs_sbi(inode->i_sb);
+	dirhash = &sbi->dirhash;
+	nr = filp->f_pos;
+
+	switch(nr)
+	{
+	case 0:
+		if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos = ++nr;
+		/* fall through */
+	case 1:
+		if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos = ++nr;
+		/* fall through */
+	default:
+		while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
+			if (!ent->dentry || d_mountpoint(ent->dentry)) {
+				if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
+					goto out;
+				filp->f_pos = nr;
+			}
+		}
+		break;
+	}
+
+out:
+	unlock_kernel();
+	return 0;
+}
+
+static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
+{
+	struct inode * inode;
+	struct autofs_dir_ent *ent;
+	int status = 0;
+
+	if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
+		do {
+			if (status && dentry->d_inode) {
+				if (status != -ENOENT)
+					printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
+				return 0; /* Try to get the kernel to invalidate this dentry */
+			}
+
+			/* Turn this into a real negative dentry? */
+			if (status == -ENOENT) {
+				dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
+				dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+				return 1;
+			} else if (status) {
+				/* Return a negative dentry, but leave it "pending" */
+				return 1;
+			}
+			status = autofs_wait(sbi, &dentry->d_name);
+		} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
+	}
+
+	/* Abuse this field as a pointer to the directory entry, used to
+	   find the expire list pointers */
+	dentry->d_time = (unsigned long) ent;
+	
+	if (!dentry->d_inode) {
+		inode = autofs_iget(sb, ent->ino);
+		if (IS_ERR(inode)) {
+			/* Failed, but leave pending for next time */
+			return 1;
+		}
+		dentry->d_inode = inode;
+	}
+
+	/* If this is a directory that isn't a mount point, bitch at the
+	   daemon and fix it in user space */
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
+		return !autofs_wait(sbi, &dentry->d_name);
+	}
+
+	/* We don't update the usages for the autofs daemon itself, this
+	   is necessary for recursive autofs mounts */
+	if (!autofs_oz_mode(sbi)) {
+		autofs_update_usage(&sbi->dirhash,ent);
+	}
+
+	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+	return 1;
+}
+
+
+/*
+ * Revalidate is called on every cache lookup.  Some of those
+ * cache lookups may actually happen while the dentry is not
+ * yet completely filled in, and revalidate has to delay such
+ * lookups..
+ */
+static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
+{
+	struct inode * dir;
+	struct autofs_sb_info *sbi;
+	struct autofs_dir_ent *ent;
+	int res;
+
+	lock_kernel();
+	dir = dentry->d_parent->d_inode;
+	sbi = autofs_sbi(dir->i_sb);
+
+	/* Pending dentry */
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+		if (autofs_oz_mode(sbi))
+			res = 1;
+		else
+			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
+		unlock_kernel();
+		return res;
+	}
+
+	/* Negative dentry.. invalidate if "old" */
+	if (!dentry->d_inode) {
+		unlock_kernel();
+		return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
+	}
+		
+	/* Check for a non-mountpoint directory */
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
+		if (autofs_oz_mode(sbi))
+			res = 1;
+		else
+			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
+		unlock_kernel();
+		return res;
+	}
+
+	/* Update the usage list */
+	if (!autofs_oz_mode(sbi)) {
+		ent = (struct autofs_dir_ent *) dentry->d_time;
+		if (ent)
+			autofs_update_usage(&sbi->dirhash,ent);
+	}
+	unlock_kernel();
+	return 1;
+}
+
+static const struct dentry_operations autofs_dentry_operations = {
+	.d_revalidate	= autofs_revalidate,
+};
+
+static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct autofs_sb_info *sbi;
+	int oz_mode;
+
+	DPRINTK(("autofs_root_lookup: name = "));
+	lock_kernel();
+	autofs_say(dentry->d_name.name,dentry->d_name.len);
+
+	if (dentry->d_name.len > NAME_MAX) {
+		unlock_kernel();
+		return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
+	}
+
+	sbi = autofs_sbi(dir->i_sb);
+
+	oz_mode = autofs_oz_mode(sbi);
+	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
+				"oz_mode = %d\n", task_pid_nr(current),
+				task_pgrp_nr(current), sbi->catatonic,
+				oz_mode));
+
+	/*
+	 * Mark the dentry incomplete, but add it. This is needed so
+	 * that the VFS layer knows about the dentry, and we can count
+	 * on catching any lookups through the revalidate.
+	 *
+	 * Let all the hard work be done by the revalidate function that
+	 * needs to be able to do this anyway..
+	 *
+	 * We need to do this before we release the directory semaphore.
+	 */
+	dentry->d_op = &autofs_dentry_operations;
+	dentry->d_flags |= DCACHE_AUTOFS_PENDING;
+	d_add(dentry, NULL);
+
+	mutex_unlock(&dir->i_mutex);
+	autofs_revalidate(dentry, nd);
+	mutex_lock(&dir->i_mutex);
+
+	/*
+	 * If we are still pending, check if we had to handle
+	 * a signal. If so we can force a restart..
+	 */
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+		/* See if we were interrupted */
+		if (signal_pending(current)) {
+			sigset_t *sigset = &current->pending.signal;
+			if (sigismember (sigset, SIGKILL) ||
+			    sigismember (sigset, SIGQUIT) ||
+			    sigismember (sigset, SIGINT)) {
+				unlock_kernel();
+				return ERR_PTR(-ERESTARTNOINTR);
+			}
+		}
+	}
+	unlock_kernel();
+
+	/*
+	 * If this dentry is unhashed, then we shouldn't honour this
+	 * lookup even if the dentry is positive.  Returning ENOENT here
+	 * doesn't do the right thing for all system calls, but it should
+	 * be OK for the operations we permit from an autofs.
+	 */
+	if (dentry->d_inode && d_unhashed(dentry))
+		return ERR_PTR(-ENOENT);
+
+	return NULL;
+}
+
+static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+	struct autofs_dirhash *dh = &sbi->dirhash;
+	struct autofs_dir_ent *ent;
+	unsigned int n;
+	int slsize;
+	struct autofs_symlink *sl;
+	struct inode *inode;
+
+	DPRINTK(("autofs_root_symlink: %s <- ", symname));
+	autofs_say(dentry->d_name.name,dentry->d_name.len);
+
+	lock_kernel();
+	if (!autofs_oz_mode(sbi)) {
+		unlock_kernel();
+		return -EACCES;
+	}
+
+	if (autofs_hash_lookup(dh, &dentry->d_name)) {
+		unlock_kernel();
+		return -EEXIST;
+	}
+
+	n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
+	if (n >= AUTOFS_MAX_SYMLINKS) {
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	set_bit(n,sbi->symlink_bitmap);
+	sl = &sbi->symlink[n];
+	sl->len = strlen(symname);
+	sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
+	if (!sl->data) {
+		clear_bit(n,sbi->symlink_bitmap);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
+	if (!ent) {
+		kfree(sl->data);
+		clear_bit(n,sbi->symlink_bitmap);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
+	if (!ent->name) {
+		kfree(sl->data);
+		kfree(ent);
+		clear_bit(n,sbi->symlink_bitmap);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	memcpy(sl->data,symname,slsize);
+	sl->mtime = get_seconds();
+
+	ent->ino = AUTOFS_FIRST_SYMLINK + n;
+	ent->hash = dentry->d_name.hash;
+	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
+	ent->dentry = NULL;	/* We don't keep the dentry for symlinks */
+
+	autofs_hash_insert(dh,ent);
+
+	inode = autofs_iget(dir->i_sb, ent->ino);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	d_instantiate(dentry, inode);
+	unlock_kernel();
+	return 0;
+}
+
+/*
+ * NOTE!
+ *
+ * Normal filesystems would do a "d_delete()" to tell the VFS dcache
+ * that the file no longer exists. However, doing that means that the
+ * VFS layer can turn the dentry into a negative dentry, which we
+ * obviously do not want (we're dropping the entry not because it
+ * doesn't exist, but because it has timed out).
+ *
+ * Also see autofs_root_rmdir()..
+ */
+static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+	struct autofs_dirhash *dh = &sbi->dirhash;
+	struct autofs_dir_ent *ent;
+	unsigned int n;
+
+	/* This allows root to remove symlinks */
+	lock_kernel();
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
+		unlock_kernel();
+		return -EACCES;
+	}
+
+	ent = autofs_hash_lookup(dh, &dentry->d_name);
+	if (!ent) {
+		unlock_kernel();
+		return -ENOENT;
+	}
+
+	n = ent->ino - AUTOFS_FIRST_SYMLINK;
+	if (n >= AUTOFS_MAX_SYMLINKS) {
+		unlock_kernel();
+		return -EISDIR;	/* It's a directory, dummy */
+	}
+	if (!test_bit(n,sbi->symlink_bitmap)) {
+		unlock_kernel();
+		return -EINVAL;	/* Nonexistent symlink?  Shouldn't happen */
+	}
+	
+	dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
+	autofs_hash_delete(ent);
+	clear_bit(n,sbi->symlink_bitmap);
+	kfree(sbi->symlink[n].data);
+	d_drop(dentry);
+	
+	unlock_kernel();
+	return 0;
+}
+
+static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+	struct autofs_dirhash *dh = &sbi->dirhash;
+	struct autofs_dir_ent *ent;
+
+	lock_kernel();
+	if (!autofs_oz_mode(sbi)) {
+		unlock_kernel();
+		return -EACCES;
+	}
+
+	ent = autofs_hash_lookup(dh, &dentry->d_name);
+	if (!ent) {
+		unlock_kernel();
+		return -ENOENT;
+	}
+
+	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
+		unlock_kernel();
+		return -ENOTDIR; /* Not a directory */
+	}
+
+	if (ent->dentry != dentry) {
+		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
+	}
+
+	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
+	autofs_hash_delete(ent);
+	drop_nlink(dir);
+	d_drop(dentry);
+	unlock_kernel();
+
+	return 0;
+}
+
+static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+	struct autofs_dirhash *dh = &sbi->dirhash;
+	struct autofs_dir_ent *ent;
+	struct inode *inode;
+	ino_t ino;
+
+	lock_kernel();
+	if (!autofs_oz_mode(sbi)) {
+		unlock_kernel();
+		return -EACCES;
+	}
+
+	ent = autofs_hash_lookup(dh, &dentry->d_name);
+	if (ent) {
+		unlock_kernel();
+		return -EEXIST;
+	}
+
+	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
+		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
+		unlock_kernel();
+		return -ENOSPC;
+	}
+	ino = sbi->next_dir_ino++;
+
+	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
+	if (!ent) {
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
+	if (!ent->name) {
+		kfree(ent);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+
+	ent->hash = dentry->d_name.hash;
+	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
+	ent->ino = ino;
+	ent->dentry = dentry;
+	autofs_hash_insert(dh,ent);
+
+	inc_nlink(dir);
+
+	inode = autofs_iget(dir->i_sb, ino);
+	if (IS_ERR(inode)) {
+		drop_nlink(dir);
+		return PTR_ERR(inode);
+	}
+
+	d_instantiate(dentry, inode);
+	unlock_kernel();
+
+	return 0;
+}
+
+/* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 unsigned int __user *p)
+{
+	unsigned long ntimeout;
+
+	if (get_user(ntimeout, p) ||
+	    put_user(sbi->exp_timeout / HZ, p))
+		return -EFAULT;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
+static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
+					 unsigned long __user *p)
+{
+	unsigned long ntimeout;
+
+	if (get_user(ntimeout, p) ||
+	    put_user(sbi->exp_timeout / HZ, p))
+		return -EFAULT;
+
+	if (ntimeout > ULONG_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+
+/* Return protocol version */
+static inline int autofs_get_protover(int __user *p)
+{
+	return put_user(AUTOFS_PROTO_VERSION, p);
+}
+
+/* Perform an expiry operation */
+static inline int autofs_expire_run(struct super_block *sb,
+				    struct autofs_sb_info *sbi,
+				    struct vfsmount *mnt,
+				    struct autofs_packet_expire __user *pkt_p)
+{
+	struct autofs_dir_ent *ent;
+	struct autofs_packet_expire pkt;
+
+	memset(&pkt,0,sizeof pkt);
+
+	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
+	pkt.hdr.type = autofs_ptype_expire;
+
+	if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
+		return -EAGAIN;
+
+	pkt.len = ent->len;
+	memcpy(pkt.name, ent->name, pkt.len);
+	pkt.name[pkt.len] = '\0';
+
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * ioctl()'s on the root directory is the chief method for the daemon to
+ * generate kernel reactions
+ */
+static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+	void __user *argp = (void __user *)arg;
+
+	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
+
+	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
+	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
+		return -ENOTTY;
+	
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	
+	switch(cmd) {
+	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
+		return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
+	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
+		return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
+		autofs_catatonic_mode(sbi);
+		return 0;
+	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
+		return autofs_get_protover(argp);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs_compat_get_set_timeout(sbi, argp);
+#endif
+	case AUTOFS_IOC_SETTIMEOUT:
+		return autofs_get_set_timeout(sbi, argp);
+	case AUTOFS_IOC_EXPIRE:
+		return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
+					 argp);
+	default:
+		return -ENOSYS;
+	}
+
+}
+
+static long autofs_root_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	lock_kernel();
+	ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
+				   filp, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret;
+
+	lock_kernel();
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
+	else
+		ret = autofs_do_root_ioctl(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+	unlock_kernel();
+
+	return ret;
+}
+#endif
diff --git a/drivers/staging/autofs/symlink.c b/drivers/staging/autofs/symlink.c
new file mode 100644
index 000000000000..ff2c65cde753
--- /dev/null
+++ b/drivers/staging/autofs/symlink.c
@@ -0,0 +1,26 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/symlink.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include "autofs_i.h"
+
+/* Nothing to release.. */
+static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
+	nd_set_link(nd, s);
+	return NULL;
+}
+
+const struct inode_operations autofs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= autofs_follow_link
+};
diff --git a/drivers/staging/autofs/waitq.c b/drivers/staging/autofs/waitq.c
new file mode 100644
index 000000000000..d3c8cc9eb4d1
--- /dev/null
+++ b/drivers/staging/autofs/waitq.c
@@ -0,0 +1,205 @@
+/* -*- linux-c -*- --------------------------------------------------------- *
+ *
+ * drivers/staging/autofs/waitq.c
+ *
+ *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * ------------------------------------------------------------------------- */
+
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/file.h>
+#include "autofs_i.h"
+
+/* We make this a static variable rather than a part of the superblock; it
+   is better if we don't reassign numbers easily even across filesystems */
+static autofs_wqt_t autofs_next_wait_queue = 1;
+
+/* These are the signals we allow interrupting a pending mount */
+#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
+
+void autofs_catatonic_mode(struct autofs_sb_info *sbi)
+{
+	struct autofs_wait_queue *wq, *nwq;
+
+	DPRINTK(("autofs: entering catatonic mode\n"));
+
+	sbi->catatonic = 1;
+	wq = sbi->queues;
+	sbi->queues = NULL;	/* Erase all wait queues */
+	while ( wq ) {
+		nwq = wq->next;
+		wq->status = -ENOENT; /* Magic is gone - report failure */
+		kfree(wq->name);
+		wq->name = NULL;
+		wake_up(&wq->queue);
+		wq = nwq;
+	}
+	fput(sbi->pipe);	/* Close the pipe */
+	sbi->pipe = NULL;
+	autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
+}
+
+static int autofs_write(struct file *file, const void *addr, int bytes)
+{
+	unsigned long sigpipe, flags;
+	mm_segment_t fs;
+	const char *data = (const char *)addr;
+	ssize_t wr = 0;
+
+	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
+
+	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
+
+	/* Save pointer to user space and point back to kernel space */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	while (bytes &&
+	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
+		data += wr;
+		bytes -= wr;
+	}
+
+	set_fs(fs);
+
+	/* Keep the currently executing process from receiving a
+	   SIGPIPE unless it was already supposed to get one */
+	if (wr == -EPIPE && !sigpipe) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		sigdelset(&current->pending.signal, SIGPIPE);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	return (bytes > 0);
+}
+	
+static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
+{
+	struct autofs_packet_missing pkt;
+
+	DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
+	autofs_say(wq->name,wq->len);
+
+	memset(&pkt,0,sizeof pkt); /* For security reasons */
+
+	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
+	pkt.hdr.type = autofs_ptype_missing;
+	pkt.wait_queue_token = wq->wait_queue_token;
+	pkt.len = wq->len;
+        memcpy(pkt.name, wq->name, pkt.len);
+	pkt.name[pkt.len] = '\0';
+
+	if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
+		autofs_catatonic_mode(sbi);
+}
+
+int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
+{
+	struct autofs_wait_queue *wq;
+	int status;
+
+	/* In catatonic mode, we don't wait for nobody */
+	if ( sbi->catatonic )
+		return -ENOENT;
+	
+	/* We shouldn't be able to get here, but just in case */
+	if ( name->len > NAME_MAX )
+		return -ENOENT;
+
+	for ( wq = sbi->queues ; wq ; wq = wq->next ) {
+		if ( wq->hash == name->hash &&
+		     wq->len == name->len &&
+		     wq->name && !memcmp(wq->name,name->name,name->len) )
+			break;
+	}
+	
+	if ( !wq ) {
+		/* Create a new wait queue */
+		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+		if ( !wq )
+			return -ENOMEM;
+
+		wq->name = kmalloc(name->len,GFP_KERNEL);
+		if ( !wq->name ) {
+			kfree(wq);
+			return -ENOMEM;
+		}
+		wq->wait_queue_token = autofs_next_wait_queue++;
+		init_waitqueue_head(&wq->queue);
+		wq->hash = name->hash;
+		wq->len = name->len;
+		wq->status = -EINTR; /* Status return if interrupted */
+		memcpy(wq->name, name->name, name->len);
+		wq->next = sbi->queues;
+		sbi->queues = wq;
+
+		/* autofs_notify_daemon() may block */
+		wq->wait_ctr = 2;
+		autofs_notify_daemon(sbi,wq);
+	} else
+		wq->wait_ctr++;
+
+	/* wq->name is NULL if and only if the lock is already released */
+
+	if ( sbi->catatonic ) {
+		/* We might have slept, so check again for catatonic mode */
+		wq->status = -ENOENT;
+		kfree(wq->name);
+		wq->name = NULL;
+	}
+
+	if ( wq->name ) {
+		/* Block all but "shutdown" signals while waiting */
+		sigset_t sigmask;
+
+		siginitsetinv(&sigmask, SHUTDOWN_SIGS);
+		sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
+
+		interruptible_sleep_on(&wq->queue);
+
+		sigprocmask(SIG_SETMASK, &sigmask, NULL);
+	} else {
+		DPRINTK(("autofs_wait: skipped sleeping\n"));
+	}
+
+	status = wq->status;
+
+	if ( ! --wq->wait_ctr )	/* Are we the last process to need status? */
+		kfree(wq);
+
+	return status;
+}
+
+
+int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
+{
+	struct autofs_wait_queue *wq, **wql;
+
+	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
+		if ( wq->wait_queue_token == wait_queue_token )
+			break;
+	}
+	if ( !wq )
+		return -EINVAL;
+
+	*wql = wq->next;	/* Unlink from chain */
+	kfree(wq->name);
+	wq->name = NULL;	/* Do not wait on this queue */
+
+	wq->status = status;
+
+	if ( ! --wq->wait_ctr )	/* Is anyone still waiting for this guy? */
+		kfree(wq);
+	else
+		wake_up(&wq->queue);
+
+	return 0;
+}
+
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..30da8ee16a96 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -59,7 +59,6 @@ source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
 
-source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..e571feddd7b7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,7 +104,6 @@ obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
 obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
-obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911e..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
-config AUTOFS_FS
-	tristate "Kernel automounter support"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from the autofs
-	  package; you can find the location in <file:Documentation/Changes>.
-	  You also want to answer Y to "NFS file system support", below.
-
-	  If you want to use the newer version of the automounter with more
-	  features, say N here and say Y to "Kernel automounter v4 support",
-	  below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs.
-
-	  If you are not a part of a fairly large, distributed network, you
-	  probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux autofs-filesystem routines.
-#
-
-obj-$(CONFIG_AUTOFS_FS) += autofs.o
-
-autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/fs/autofs/autofs_i.h
- *
- *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/* Internal header file for autofs */
-
-#include <linux/auto_fs.h>
-
-/* This is the range of ioctl() numbers we claim as ours */
-#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
-#define AUTOFS_IOC_COUNT     32
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-
-#include <asm/current.h>
-#include <asm/uaccess.h>
-
-#ifdef DEBUG
-#define DPRINTK(D) (printk D)
-#else
-#define DPRINTK(D) ((void)0)
-#endif
-
-/*
- * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
- * kernel will keep the negative response cached for up to the time given
- * here, although the time can be shorter if the kernel throws the dcache
- * entry away.  This probably should be settable from user space.
- */
-#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ)	/* 1 minute */
-
-/* Structures associated with the root directory hash table */
-
-#define AUTOFS_HASH_SIZE 67
-
-struct autofs_dir_ent {
-	int hash;
-	char *name;
-	int len;
-	ino_t ino;
-	struct dentry *dentry;
-	/* Linked list of entries */
-	struct autofs_dir_ent *next;
-	struct autofs_dir_ent **back;
-	/* The following entries are for the expiry system */
-	unsigned long last_usage;
-	struct list_head exp;
-};
-
-struct autofs_dirhash {
-	struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
-	struct list_head expiry_head;
-};
-
-struct autofs_wait_queue {
-	wait_queue_head_t queue;
-	struct autofs_wait_queue *next;
-	autofs_wqt_t wait_queue_token;
-	/* We use the following to see what we are waiting for */
-	int hash;
-	int len;
-	char *name;
-	/* This is for status reporting upon return */
-	int status;
-	int wait_ctr;
-};
-
-struct autofs_symlink {
-	char *data;
-	int len;
-	time_t mtime;
-};
-
-#define AUTOFS_MAX_SYMLINKS 256
-
-#define AUTOFS_ROOT_INO      1
-#define AUTOFS_FIRST_SYMLINK 2
-#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
-
-#define AUTOFS_SYMLINK_BITMAP_LEN \
-	((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
-
-#define AUTOFS_SBI_MAGIC 0x6d4a556d
-
-struct autofs_sb_info {
-	u32 magic;
-	struct file *pipe;
-	struct pid *oz_pgrp;
-	int catatonic;
-	struct super_block *sb;
-	unsigned long exp_timeout;
-	ino_t next_dir_ino;
-	struct autofs_wait_queue *queues; /* Wait queue pointer */
-	struct autofs_dirhash dirhash; /* Root directory hash */
-	struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
-	unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
-};
-
-static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
-{
-	return (struct autofs_sb_info *)(sb->s_fs_info);
-}
-
-/* autofs_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-
-static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
-}
-
-/* Hash operations */
-
-void autofs_initialize_hash(struct autofs_dirhash *);
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
-void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
-void autofs_hash_delete(struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
-void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_sb_info *);
-
-/* Expiration-handling functions */
-
-void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
-
-/* Operations structures */
-
-extern const struct inode_operations autofs_root_inode_operations;
-extern const struct inode_operations autofs_symlink_inode_operations;
-extern const struct file_operations autofs_root_operations;
-
-/* Initializing function */
-
-int autofs_fill_super(struct super_block *, void *, int);
-void autofs_kill_sb(struct super_block *sb);
-struct inode *autofs_iget(struct super_block *, unsigned long);
-
-/* Queue management functions */
-
-int autofs_wait(struct autofs_sb_info *,struct qstr *);
-int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
-void autofs_catatonic_mode(struct autofs_sb_info *);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len);
-#else
-#define autofs_say(n,l) ((void)0)
-#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/dirhash.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Functions for maintenance of expiry queue */
-
-static void autofs_init_usage(struct autofs_dirhash *dh,
-			      struct autofs_dir_ent *ent)
-{
-	list_add_tail(&ent->exp, &dh->expiry_head);
-	ent->last_usage = jiffies;
-}
-
-static void autofs_delete_usage(struct autofs_dir_ent *ent)
-{
-	list_del(&ent->exp);
-}
-
-void autofs_update_usage(struct autofs_dirhash *dh,
-			 struct autofs_dir_ent *ent)
-{
-	autofs_delete_usage(ent);   /* Unlink from current position */
-	autofs_init_usage(dh,ent);  /* Relink at queue tail */
-}
-
-struct autofs_dir_ent *autofs_expire(struct super_block *sb,
-				     struct autofs_sb_info *sbi,
-				     struct vfsmount *mnt)
-{
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned long timeout = sbi->exp_timeout;
-
-	while (1) {
-		struct path path;
-		int umount_ok;
-
-		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
-			return NULL;	/* No entries */
-		/* We keep the list sorted by last_usage and want old stuff */
-		ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
-		if (jiffies - ent->last_usage < timeout)
-			break;
-		/* Move to end of list in case expiry isn't desirable */
-		autofs_update_usage(dh, ent);
-
-		/* Check to see that entry is expirable */
-		if ( ent->ino < AUTOFS_FIRST_DIR_INO )
-			return ent; /* Symlinks are always expirable */
-
-		/* Get the dentry for the autofs subdirectory */
-		path.dentry = ent->dentry;
-
-		if (!path.dentry) {
-			/* Should only happen in catatonic mode */
-			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		if (!path.dentry->d_inode) {
-			dput(path.dentry);
-			printk("autofs: negative dentry on expiry queue: %s\n",
-			       ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		/* Make sure entry is mounted and unused; note that dentry will
-		   point to the mounted-on-top root. */
-		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
-		    !d_mountpoint(path.dentry)) {
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		path.mnt = mnt;
-		path_get(&path);
-		if (!follow_down(&path)) {
-			path_put(&path);
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		while (d_mountpoint(path.dentry) && follow_down(&path))
-			;
-		umount_ok = may_umount(path.mnt);
-		path_put(&path);
-
-		if (umount_ok) {
-			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
-			return ent; /* Expirable! */
-		}
-		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-	}
-	return NULL;		/* No expirable entries */
-}
-
-void autofs_initialize_hash(struct autofs_dirhash *dh) {
-	memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
-	INIT_LIST_HEAD(&dh->expiry_head);
-}
-
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
-{
-	struct autofs_dir_ent *dhn;
-
-	DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
-	autofs_say(name->name,name->len);
-
-	for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
-		if ( name->hash == dhn->hash &&
-		     name->len == dhn->len &&
-		     !memcmp(name->name, dhn->name, name->len) )
-			break;
-	}
-
-	return dhn;
-}
-
-void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
-{
-	struct autofs_dir_ent **dhnp;
-
-	DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
-	autofs_say(ent->name,ent->len);
-
-	autofs_init_usage(dh,ent);
-	if (ent->dentry)
-		dget(ent->dentry);
-
-	dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
-	ent->next = *dhnp;
-	ent->back = dhnp;
-	*dhnp = ent;
-	if ( ent->next )
-		ent->next->back = &(ent->next);
-}
-
-void autofs_hash_delete(struct autofs_dir_ent *ent)
-{
-	*(ent->back) = ent->next;
-	if ( ent->next )
-		ent->next->back = ent->back;
-
-	autofs_delete_usage(ent);
-
-	if ( ent->dentry )
-		dput(ent->dentry);
-	kfree(ent->name);
-	kfree(ent);
-}
-
-/*
- * Used by readdir().  We must validate "ptr", so we can't simply make it
- * a pointer.  Values below 0xffff are reserved; calling with any value
- * <= 0x10000 will return the first entry found.
- *
- * "last" can be NULL or the value returned by the last search *if* we
- * want the next sequential entry.
- */
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
-					off_t *ptr, struct autofs_dir_ent *last)
-{
-	int bucket, ecount, i;
-	struct autofs_dir_ent *ent;
-
-	bucket = (*ptr >> 16) - 1;
-	ecount = *ptr & 0xffff;
-
-	if ( bucket < 0 ) {
-		bucket = ecount = 0;
-	} 
-
-	DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
-
-	ent = last ? last->next : NULL;
-
-	if ( ent ) {
-		ecount++;
-	} else {
-		while  ( bucket < AUTOFS_HASH_SIZE ) {
-			ent = dh->h[bucket];
-			for ( i = ecount ; ent && i ; i-- )
-				ent = ent->next;
-			
-			if (ent) {
-				ecount++; /* Point to *next* entry */
-				break;
-			}
-			
-			bucket++; ecount = 0;
-		}
-	}
-
-#ifdef DEBUG
-	if ( !ent )
-		printk("autofs_hash_enum: nothing found\n");
-	else {
-		printk("autofs_hash_enum: found hash %08x, name", ent->hash);
-		autofs_say(ent->name,ent->len);
-	}
-#endif
-
-	*ptr = ((bucket+1) << 16) + ecount;
-	return ent;
-}
-
-/* Iterate over all the ents, and remove all dentry pointers.  Used on
-   entering catatonic mode, in order to make the filesystem unmountable. */
-void autofs_hash_dputall(struct autofs_dirhash *dh)
-{
-	int i;
-	struct autofs_dir_ent *ent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
-			if ( ent->dentry ) {
-				dput(ent->dentry);
-				ent->dentry = NULL;
-			}
-		}
-	}
-}
-
-/* Delete everything.  This is used on filesystem destruction, so we
-   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_sb_info *sbi)
-{
-	int i;
-	struct autofs_dir_ent *ent, *nent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
-			nent = ent->next;
-			if ( ent->dentry )
-				dput(ent->dentry);
-			kfree(ent->name);
-			kfree(ent);
-		}
-	}
-}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include "autofs_i.h"
-
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
-}
-
-static struct file_system_type autofs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
-	.kill_sb	= autofs_kill_sb,
-};
-
-static int __init init_autofs_fs(void)
-{
-	return register_filesystem(&autofs_fs_type);
-}
-
-static void __exit exit_autofs_fs(void)
-{
-	unregister_filesystem(&autofs_fs_type);
-}
-
-module_init(init_autofs_fs);
-module_exit(exit_autofs_fs);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len)
-{
-	printk("(%d: ", len);
-	while ( len-- )
-		printk("%c", *name++);
-	printk(")\n");
-}
-#endif
-MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/parser.h>
-#include <linux/bitops.h>
-#include <linux/magic.h>
-#include "autofs_i.h"
-#include <linux/module.h>
-
-void autofs_kill_sb(struct super_block *sb)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	unsigned int n;
-
-	/*
-	 * In the event of a failure in get_sb_nodev the superblock
-	 * info is not present so nothing else has been setup, so
-	 * just call kill_anon_super when we are called from
-	 * deactivate_super.
-	 */
-	if (!sbi)
-		goto out_kill_sb;
-
-	if (!sbi->catatonic)
-		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-
-	put_pid(sbi->oz_pgrp);
-
-	autofs_hash_nuke(sbi);
-	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
-		if (test_bit(n, sbi->symlink_bitmap))
-			kfree(sbi->symlink[n].data);
-	}
-
-	kfree(sb->s_fs_info);
-
-out_kill_sb:
-	DPRINTK(("autofs: shutting down\n"));
-	kill_anon_super(sb);
-}
-
-static const struct super_operations autofs_sops = {
-	.statfs		= simple_statfs,
-	.show_options	= generic_show_options,
-};
-
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-
-static const match_table_t autofs_tokens = {
-	{Opt_fd, "fd=%u"},
-	{Opt_uid, "uid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_pgrp, "pgrp=%u"},
-	{Opt_minproto, "minproto=%u"},
-	{Opt_maxproto, "maxproto=%u"},
-	{Opt_err, NULL}
-};
-
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-		pid_t *pgrp, int *minproto, int *maxproto)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-
-	*uid = current_uid();
-	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
-
-	*minproto = *maxproto = AUTOFS_PROTO_VERSION;
-
-	*pipefd = -1;
-
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-
-		token = match_token(p, autofs_tokens, args);
-		switch (token) {
-		case Opt_fd:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pipefd = option;
-			break;
-		case Opt_uid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*uid = option;
-			break;
-		case Opt_gid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*gid = option;
-			break;
-		case Opt_pgrp:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pgrp = option;
-			break;
-		case Opt_minproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*minproto = option;
-			break;
-		case Opt_maxproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*maxproto = option;
-			break;
-		default:
-			return 1;
-		}
-	}
-	return (*pipefd < 0);
-}
-
-int autofs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
-	int pipefd;
-	struct autofs_sb_info *sbi;
-	int minproto, maxproto;
-	pid_t pgid;
-
-	save_mount_options(s, data);
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		goto fail_unlock;
-	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
-
-	s->s_fs_info = sbi;
-	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->pipe = NULL;
-	sbi->catatonic = 1;
-	sbi->exp_timeout = 0;
-	autofs_initialize_hash(&sbi->dirhash);
-	sbi->queues = NULL;
-	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
-	sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
-	s->s_magic = AUTOFS_SUPER_MAGIC;
-	s->s_op = &autofs_sops;
-	s->s_time_gran = 1;
-	sbi->sb = s;
-
-	root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
-	if (IS_ERR(root_inode))
-		goto fail_free;
-	root = d_alloc_root(root_inode);
-	pipe = NULL;
-
-	if (!root)
-		goto fail_iput;
-
-	/* Can this call block?  - WTF cares? s is locked. */
-	if (parse_options(data, &pipefd, &root_inode->i_uid,
-				&root_inode->i_gid, &pgid, &minproto,
-				&maxproto)) {
-		printk("autofs: called with bogus options\n");
-		goto fail_dput;
-	}
-
-	/* Couldn't this be tested earlier? */
-	if (minproto > AUTOFS_PROTO_VERSION ||
-	     maxproto < AUTOFS_PROTO_VERSION) {
-		printk("autofs: kernel does not match daemon version\n");
-		goto fail_dput;
-	}
-
-	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
-	sbi->oz_pgrp = find_get_pid(pgid);
-
-	if (!sbi->oz_pgrp) {
-		printk("autofs: could not find process group %d\n", pgid);
-		goto fail_dput;
-	}
-
-	pipe = fget(pipefd);
-	
-	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_put_pid;
-	}
-
-	if (!pipe->f_op || !pipe->f_op->write)
-		goto fail_fput;
-	sbi->pipe = pipe;
-	sbi->catatonic = 0;
-
-	/*
-	 * Success! Install the root dentry now to indicate completion.
-	 */
-	s->s_root = root;
-	return 0;
-
-fail_fput:
-	printk("autofs: pipe file descriptor does not contain proper ops\n");
-	fput(pipe);
-fail_put_pid:
-	put_pid(sbi->oz_pgrp);
-fail_dput:
-	dput(root);
-	goto fail_free;
-fail_iput:
-	printk("autofs: get root dentry failed\n");
-	iput(root_inode);
-fail_free:
-	kfree(sbi);
-	s->s_fs_info = NULL;
-fail_unlock:
-	return -EINVAL;
-}
-
-struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
-{
-	unsigned int n;
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	struct inode *inode;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
-
-	/* Initialize to the default case (stub directory) */
-
-	inode->i_op = &simple_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
-	if (ino == AUTOFS_ROOT_INO) {
-		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
-		inode->i_op = &autofs_root_inode_operations;
-		inode->i_fop = &autofs_root_operations;
-		goto done;
-	} 
-	
-	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
-	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
-	
-	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
-		/* Symlink inode - should be in symlink list */
-		struct autofs_symlink *sl;
-
-		n = ino - AUTOFS_FIRST_SYMLINK;
-		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
-			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
-			goto done;
-		}
-		
-		inode->i_op = &autofs_symlink_inode_operations;
-		sl = &sbi->symlink[n];
-		inode->i_private = sl;
-		inode->i_mode = S_IFLNK | S_IRWXUGO;
-		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
-		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
-		inode->i_size = sl->len;
-		inode->i_nlink = 1;
-	}
-
-done:
-	unlock_new_inode(inode);
-	return inode;
-}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 11b1ea786d00..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,643 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/param.h>
-#include <linux/time.h>
-#include <linux/compat.h>
-#include <linux/smp_lock.h>
-#include "autofs_i.h"
-
-static int autofs_root_readdir(struct file *,void *,filldir_t);
-static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
-static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
-static int autofs_root_unlink(struct inode *,struct dentry *);
-static int autofs_root_rmdir(struct inode *,struct dentry *);
-static int autofs_root_mkdir(struct inode *,struct dentry *,int);
-static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
-static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
-
-const struct file_operations autofs_root_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= autofs_root_readdir,
-	.unlocked_ioctl	= autofs_root_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= autofs_root_compat_ioctl,
-#endif
-};
-
-const struct inode_operations autofs_root_inode_operations = {
-        .lookup		= autofs_root_lookup,
-        .unlink		= autofs_root_unlink,
-        .symlink	= autofs_root_symlink,
-        .mkdir		= autofs_root_mkdir,
-        .rmdir		= autofs_root_rmdir,
-};
-
-static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct autofs_dir_ent *ent = NULL;
-	struct autofs_dirhash *dirhash;
-	struct autofs_sb_info *sbi;
-	struct inode * inode = filp->f_path.dentry->d_inode;
-	off_t onr, nr;
-
-	lock_kernel();
-
-	sbi = autofs_sbi(inode->i_sb);
-	dirhash = &sbi->dirhash;
-	nr = filp->f_pos;
-
-	switch(nr)
-	{
-	case 0:
-		if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = ++nr;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = ++nr;
-		/* fall through */
-	default:
-		while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
-			if (!ent->dentry || d_mountpoint(ent->dentry)) {
-				if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
-					goto out;
-				filp->f_pos = nr;
-			}
-		}
-		break;
-	}
-
-out:
-	unlock_kernel();
-	return 0;
-}
-
-static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
-{
-	struct inode * inode;
-	struct autofs_dir_ent *ent;
-	int status = 0;
-
-	if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
-		do {
-			if (status && dentry->d_inode) {
-				if (status != -ENOENT)
-					printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
-				return 0; /* Try to get the kernel to invalidate this dentry */
-			}
-
-			/* Turn this into a real negative dentry? */
-			if (status == -ENOENT) {
-				dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
-				dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-				return 1;
-			} else if (status) {
-				/* Return a negative dentry, but leave it "pending" */
-				return 1;
-			}
-			status = autofs_wait(sbi, &dentry->d_name);
-		} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
-	}
-
-	/* Abuse this field as a pointer to the directory entry, used to
-	   find the expire list pointers */
-	dentry->d_time = (unsigned long) ent;
-	
-	if (!dentry->d_inode) {
-		inode = autofs_iget(sb, ent->ino);
-		if (IS_ERR(inode)) {
-			/* Failed, but leave pending for next time */
-			return 1;
-		}
-		dentry->d_inode = inode;
-	}
-
-	/* If this is a directory that isn't a mount point, bitch at the
-	   daemon and fix it in user space */
-	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-		return !autofs_wait(sbi, &dentry->d_name);
-	}
-
-	/* We don't update the usages for the autofs daemon itself, this
-	   is necessary for recursive autofs mounts */
-	if (!autofs_oz_mode(sbi)) {
-		autofs_update_usage(&sbi->dirhash,ent);
-	}
-
-	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-	return 1;
-}
-
-
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
-{
-	struct inode * dir;
-	struct autofs_sb_info *sbi;
-	struct autofs_dir_ent *ent;
-	int res;
-
-	lock_kernel();
-	dir = dentry->d_parent->d_inode;
-	sbi = autofs_sbi(dir->i_sb);
-
-	/* Pending dentry */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-		if (autofs_oz_mode(sbi))
-			res = 1;
-		else
-			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-		unlock_kernel();
-		return res;
-	}
-
-	/* Negative dentry.. invalidate if "old" */
-	if (!dentry->d_inode) {
-		unlock_kernel();
-		return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
-	}
-		
-	/* Check for a non-mountpoint directory */
-	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-		if (autofs_oz_mode(sbi))
-			res = 1;
-		else
-			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-		unlock_kernel();
-		return res;
-	}
-
-	/* Update the usage list */
-	if (!autofs_oz_mode(sbi)) {
-		ent = (struct autofs_dir_ent *) dentry->d_time;
-		if (ent)
-			autofs_update_usage(&sbi->dirhash,ent);
-	}
-	unlock_kernel();
-	return 1;
-}
-
-static const struct dentry_operations autofs_dentry_operations = {
-	.d_revalidate	= autofs_revalidate,
-};
-
-static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	struct autofs_sb_info *sbi;
-	int oz_mode;
-
-	DPRINTK(("autofs_root_lookup: name = "));
-	lock_kernel();
-	autofs_say(dentry->d_name.name,dentry->d_name.len);
-
-	if (dentry->d_name.len > NAME_MAX) {
-		unlock_kernel();
-		return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
-	}
-
-	sbi = autofs_sbi(dir->i_sb);
-
-	oz_mode = autofs_oz_mode(sbi);
-	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
-				"oz_mode = %d\n", task_pid_nr(current),
-				task_pgrp_nr(current), sbi->catatonic,
-				oz_mode));
-
-	/*
-	 * Mark the dentry incomplete, but add it. This is needed so
-	 * that the VFS layer knows about the dentry, and we can count
-	 * on catching any lookups through the revalidate.
-	 *
-	 * Let all the hard work be done by the revalidate function that
-	 * needs to be able to do this anyway..
-	 *
-	 * We need to do this before we release the directory semaphore.
-	 */
-	dentry->d_op = &autofs_dentry_operations;
-	dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-	d_add(dentry, NULL);
-
-	mutex_unlock(&dir->i_mutex);
-	autofs_revalidate(dentry, nd);
-	mutex_lock(&dir->i_mutex);
-
-	/*
-	 * If we are still pending, check if we had to handle
-	 * a signal. If so we can force a restart..
-	 */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-		/* See if we were interrupted */
-		if (signal_pending(current)) {
-			sigset_t *sigset = &current->pending.signal;
-			if (sigismember (sigset, SIGKILL) ||
-			    sigismember (sigset, SIGQUIT) ||
-			    sigismember (sigset, SIGINT)) {
-				unlock_kernel();
-				return ERR_PTR(-ERESTARTNOINTR);
-			}
-		}
-	}
-	unlock_kernel();
-
-	/*
-	 * If this dentry is unhashed, then we shouldn't honour this
-	 * lookup even if the dentry is positive.  Returning ENOENT here
-	 * doesn't do the right thing for all system calls, but it should
-	 * be OK for the operations we permit from an autofs.
-	 */
-	if (dentry->d_inode && d_unhashed(dentry))
-		return ERR_PTR(-ENOENT);
-
-	return NULL;
-}
-
-static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned int n;
-	int slsize;
-	struct autofs_symlink *sl;
-	struct inode *inode;
-
-	DPRINTK(("autofs_root_symlink: %s <- ", symname));
-	autofs_say(dentry->d_name.name,dentry->d_name.len);
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	if (autofs_hash_lookup(dh, &dentry->d_name)) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	set_bit(n,sbi->symlink_bitmap);
-	sl = &sbi->symlink[n];
-	sl->len = strlen(symname);
-	sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-	if (!sl->data) {
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		kfree(sl->data);
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(sl->data);
-		kfree(ent);
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	memcpy(sl->data,symname,slsize);
-	sl->mtime = get_seconds();
-
-	ent->ino = AUTOFS_FIRST_SYMLINK + n;
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->dentry = NULL;	/* We don't keep the dentry for symlinks */
-
-	autofs_hash_insert(dh,ent);
-
-	inode = autofs_iget(dir->i_sb, ent->ino);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-	return 0;
-}
-
-/*
- * NOTE!
- *
- * Normal filesystems would do a "d_delete()" to tell the VFS dcache
- * that the file no longer exists. However, doing that means that the
- * VFS layer can turn the dentry into a negative dentry, which we
- * obviously do not want (we're dropping the entry not because it
- * doesn't exist, but because it has timed out).
- *
- * Also see autofs_root_rmdir()..
- */
-static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned int n;
-
-	/* This allows root to remove symlinks */
-	lock_kernel();
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -EISDIR;	/* It's a directory, dummy */
-	}
-	if (!test_bit(n,sbi->symlink_bitmap)) {
-		unlock_kernel();
-		return -EINVAL;	/* Nonexistent symlink?  Shouldn't happen */
-	}
-	
-	dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-	autofs_hash_delete(ent);
-	clear_bit(n,sbi->symlink_bitmap);
-	kfree(sbi->symlink[n].data);
-	d_drop(dentry);
-	
-	unlock_kernel();
-	return 0;
-}
-
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-		unlock_kernel();
-		return -ENOTDIR; /* Not a directory */
-	}
-
-	if (ent->dentry != dentry) {
-		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-	autofs_hash_delete(ent);
-	drop_nlink(dir);
-	d_drop(dentry);
-	unlock_kernel();
-
-	return 0;
-}
-
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	struct inode *inode;
-	ino_t ino;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (ent) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-		unlock_kernel();
-		return -ENOSPC;
-	}
-	ino = sbi->next_dir_ino++;
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(ent);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->ino = ino;
-	ent->dentry = dentry;
-	autofs_hash_insert(dh,ent);
-
-	inc_nlink(dir);
-
-	inode = autofs_iget(dir->i_sb, ino);
-	if (IS_ERR(inode)) {
-		drop_nlink(dir);
-		return PTR_ERR(inode);
-	}
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-
-	return 0;
-}
-
-/* Get/set timeout ioctl() operation */
-#ifdef CONFIG_COMPAT
-static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned int __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > UINT_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-#endif
-
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned long __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > ULONG_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-	return put_user(AUTOFS_PROTO_VERSION, p);
-}
-
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-				    struct autofs_sb_info *sbi,
-				    struct vfsmount *mnt,
-				    struct autofs_packet_expire __user *pkt_p)
-{
-	struct autofs_dir_ent *ent;
-	struct autofs_packet_expire pkt;
-
-	memset(&pkt,0,sizeof pkt);
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_expire;
-
-	if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
-		return -EAGAIN;
-
-	pkt.len = ent->len;
-	memcpy(pkt.name, ent->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-	void __user *argp = (void __user *)arg;
-
-	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
-
-	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-		return -ENOTTY;
-	
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	
-	switch(cmd) {
-	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
-		return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
-	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
-		return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
-	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-		autofs_catatonic_mode(sbi);
-		return 0;
-	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-		return autofs_get_protover(argp);
-#ifdef CONFIG_COMPAT
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return autofs_compat_get_set_timeout(sbi, argp);
-#endif
-	case AUTOFS_IOC_SETTIMEOUT:
-		return autofs_get_set_timeout(sbi, argp);
-	case AUTOFS_IOC_EXPIRE:
-		return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-					 argp);
-	default:
-		return -ENOSYS;
-	}
-
-}
-
-static long autofs_root_ioctl(struct file *filp,
-			     unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	lock_kernel();
-	ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
-				   filp, cmd, arg);
-	unlock_kernel();
-
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *filp,
-			     unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int ret;
-
-	lock_kernel();
-	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
-		ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
-	else
-		ret = autofs_do_root_ioctl(inode, filp, cmd,
-			(unsigned long)compat_ptr(arg));
-	unlock_kernel();
-
-	return ret;
-}
-#endif
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-	nd_set_link(nd, s);
-	return NULL;
-}
-
-const struct inode_operations autofs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-	struct autofs_wait_queue *wq, *nwq;
-
-	DPRINTK(("autofs: entering catatonic mode\n"));
-
-	sbi->catatonic = 1;
-	wq = sbi->queues;
-	sbi->queues = NULL;	/* Erase all wait queues */
-	while ( wq ) {
-		nwq = wq->next;
-		wq->status = -ENOENT; /* Magic is gone - report failure */
-		kfree(wq->name);
-		wq->name = NULL;
-		wake_up(&wq->queue);
-		wq = nwq;
-	}
-	fput(sbi->pipe);	/* Close the pipe */
-	sbi->pipe = NULL;
-	autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-	unsigned long sigpipe, flags;
-	mm_segment_t fs;
-	const char *data = (const char *)addr;
-	ssize_t wr = 0;
-
-	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
-	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-
-	/* Save pointer to user space and point back to kernel space */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	while (bytes &&
-	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
-		data += wr;
-		bytes -= wr;
-	}
-
-	set_fs(fs);
-
-	/* Keep the currently executing process from receiving a
-	   SIGPIPE unless it was already supposed to get one */
-	if (wr == -EPIPE && !sigpipe) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		sigdelset(&current->pending.signal, SIGPIPE);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	return (bytes > 0);
-}
-	
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-	struct autofs_packet_missing pkt;
-
-	DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-	autofs_say(wq->name,wq->len);
-
-	memset(&pkt,0,sizeof pkt); /* For security reasons */
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_missing;
-	pkt.wait_queue_token = wq->wait_queue_token;
-	pkt.len = wq->len;
-        memcpy(pkt.name, wq->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
-		autofs_catatonic_mode(sbi);
-}
-
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-	struct autofs_wait_queue *wq;
-	int status;
-
-	/* In catatonic mode, we don't wait for nobody */
-	if ( sbi->catatonic )
-		return -ENOENT;
-	
-	/* We shouldn't be able to get here, but just in case */
-	if ( name->len > NAME_MAX )
-		return -ENOENT;
-
-	for ( wq = sbi->queues ; wq ; wq = wq->next ) {
-		if ( wq->hash == name->hash &&
-		     wq->len == name->len &&
-		     wq->name && !memcmp(wq->name,name->name,name->len) )
-			break;
-	}
-	
-	if ( !wq ) {
-		/* Create a new wait queue */
-		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
-		if ( !wq )
-			return -ENOMEM;
-
-		wq->name = kmalloc(name->len,GFP_KERNEL);
-		if ( !wq->name ) {
-			kfree(wq);
-			return -ENOMEM;
-		}
-		wq->wait_queue_token = autofs_next_wait_queue++;
-		init_waitqueue_head(&wq->queue);
-		wq->hash = name->hash;
-		wq->len = name->len;
-		wq->status = -EINTR; /* Status return if interrupted */
-		memcpy(wq->name, name->name, name->len);
-		wq->next = sbi->queues;
-		sbi->queues = wq;
-
-		/* autofs_notify_daemon() may block */
-		wq->wait_ctr = 2;
-		autofs_notify_daemon(sbi,wq);
-	} else
-		wq->wait_ctr++;
-
-	/* wq->name is NULL if and only if the lock is already released */
-
-	if ( sbi->catatonic ) {
-		/* We might have slept, so check again for catatonic mode */
-		wq->status = -ENOENT;
-		kfree(wq->name);
-		wq->name = NULL;
-	}
-
-	if ( wq->name ) {
-		/* Block all but "shutdown" signals while waiting */
-		sigset_t sigmask;
-
-		siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-		sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-
-		interruptible_sleep_on(&wq->queue);
-
-		sigprocmask(SIG_SETMASK, &sigmask, NULL);
-	} else {
-		DPRINTK(("autofs_wait: skipped sleeping\n"));
-	}
-
-	status = wq->status;
-
-	if ( ! --wq->wait_ctr )	/* Are we the last process to need status? */
-		kfree(wq);
-
-	return status;
-}
-
-
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-	struct autofs_wait_queue *wq, **wql;
-
-	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-		if ( wq->wait_queue_token == wait_queue_token )
-			break;
-	}
-	if ( !wq )
-		return -EINVAL;
-
-	*wql = wq->next;	/* Unlink from chain */
-	kfree(wq->name);
-	wq->name = NULL;	/* Do not wait on this queue */
-
-	wq->status = status;
-
-	if ( ! --wq->wait_ctr )	/* Is anyone still waiting for this guy? */
-		kfree(wq);
-	else
-		wake_up(&wq->queue);
-
-	return 0;
-}
-
-- 
cgit v1.2.3


From 2116b7a473bf1c8d26998b477c294e7fe294921f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 4 Oct 2010 22:55:57 +0200
Subject: smbfs: move to drivers/staging

smbfs has been scheduled for removal in 2.6.27, so
maybe we can now move it to drivers/staging on the
way out.

smbfs still uses the big kernel lock and nobody
is going to fix that, so we should be getting
rid of it soon.

This removes the 32 bit compat mount and ioctl
handling code, which is implemented in common fs
code, and moves all smbfs related files into
drivers/staging/smbfs.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/filesystems/00-INDEX   |    2 -
 Documentation/filesystems/smbfs.txt  |    8 -
 Documentation/ioctl/ioctl-number.txt |    2 +-
 drivers/staging/Kconfig              |    2 +
 drivers/staging/Makefile             |    1 +
 drivers/staging/smbfs/Kconfig        |   55 +
 drivers/staging/smbfs/Makefile       |   18 +
 drivers/staging/smbfs/TODO           |    8 +
 drivers/staging/smbfs/cache.c        |  208 ++
 drivers/staging/smbfs/dir.c          |  702 +++++++
 drivers/staging/smbfs/file.c         |  453 +++++
 drivers/staging/smbfs/getopt.c       |   64 +
 drivers/staging/smbfs/getopt.h       |   14 +
 drivers/staging/smbfs/inode.c        |  839 ++++++++
 drivers/staging/smbfs/ioctl.c        |   68 +
 drivers/staging/smbfs/proc.c         | 3506 +++++++++++++++++++++++++++++++++
 drivers/staging/smbfs/proto.h        |   87 +
 drivers/staging/smbfs/request.c      |  817 ++++++++
 drivers/staging/smbfs/request.h      |   70 +
 drivers/staging/smbfs/smb.h          |  118 ++
 drivers/staging/smbfs/smb_debug.h    |   34 +
 drivers/staging/smbfs/smb_fs.h       |  153 ++
 drivers/staging/smbfs/smb_fs_i.h     |   37 +
 drivers/staging/smbfs/smb_fs_sb.h    |  100 +
 drivers/staging/smbfs/smb_mount.h    |   65 +
 drivers/staging/smbfs/smbfs.txt      |    8 +
 drivers/staging/smbfs/smbiod.c       |  343 ++++
 drivers/staging/smbfs/smbno.h        |  363 ++++
 drivers/staging/smbfs/sock.c         |  385 ++++
 drivers/staging/smbfs/symlink.c      |   67 +
 fs/Kconfig                           |    1 -
 fs/Makefile                          |    1 -
 fs/compat.c                          |   31 +-
 fs/compat_ioctl.c                    |   26 -
 fs/smbfs/Kconfig                     |   55 -
 fs/smbfs/Makefile                    |   18 -
 fs/smbfs/cache.c                     |  208 --
 fs/smbfs/dir.c                       |  702 -------
 fs/smbfs/file.c                      |  454 -----
 fs/smbfs/getopt.c                    |   64 -
 fs/smbfs/getopt.h                    |   14 -
 fs/smbfs/inode.c                     |  839 --------
 fs/smbfs/ioctl.c                     |   69 -
 fs/smbfs/proc.c                      | 3507 ----------------------------------
 fs/smbfs/proto.h                     |   87 -
 fs/smbfs/request.c                   |  818 --------
 fs/smbfs/request.h                   |   70 -
 fs/smbfs/smb_debug.h                 |   34 -
 fs/smbfs/smbiod.c                    |  344 ----
 fs/smbfs/sock.c                      |  386 ----
 fs/smbfs/symlink.c                   |   68 -
 include/linux/Kbuild                 |    4 -
 include/linux/smb.h                  |  118 --
 include/linux/smb_fs.h               |  153 --
 include/linux/smb_fs_i.h             |   37 -
 include/linux/smb_fs_sb.h            |  100 -
 include/linux/smb_mount.h            |   65 -
 include/linux/smbno.h                |  363 ----
 58 files changed, 8587 insertions(+), 8646 deletions(-)
 delete mode 100644 Documentation/filesystems/smbfs.txt
 create mode 100644 drivers/staging/smbfs/Kconfig
 create mode 100644 drivers/staging/smbfs/Makefile
 create mode 100644 drivers/staging/smbfs/TODO
 create mode 100644 drivers/staging/smbfs/cache.c
 create mode 100644 drivers/staging/smbfs/dir.c
 create mode 100644 drivers/staging/smbfs/file.c
 create mode 100644 drivers/staging/smbfs/getopt.c
 create mode 100644 drivers/staging/smbfs/getopt.h
 create mode 100644 drivers/staging/smbfs/inode.c
 create mode 100644 drivers/staging/smbfs/ioctl.c
 create mode 100644 drivers/staging/smbfs/proc.c
 create mode 100644 drivers/staging/smbfs/proto.h
 create mode 100644 drivers/staging/smbfs/request.c
 create mode 100644 drivers/staging/smbfs/request.h
 create mode 100644 drivers/staging/smbfs/smb.h
 create mode 100644 drivers/staging/smbfs/smb_debug.h
 create mode 100644 drivers/staging/smbfs/smb_fs.h
 create mode 100644 drivers/staging/smbfs/smb_fs_i.h
 create mode 100644 drivers/staging/smbfs/smb_fs_sb.h
 create mode 100644 drivers/staging/smbfs/smb_mount.h
 create mode 100644 drivers/staging/smbfs/smbfs.txt
 create mode 100644 drivers/staging/smbfs/smbiod.c
 create mode 100644 drivers/staging/smbfs/smbno.h
 create mode 100644 drivers/staging/smbfs/sock.c
 create mode 100644 drivers/staging/smbfs/symlink.c
 delete mode 100644 fs/smbfs/Kconfig
 delete mode 100644 fs/smbfs/Makefile
 delete mode 100644 fs/smbfs/cache.c
 delete mode 100644 fs/smbfs/dir.c
 delete mode 100644 fs/smbfs/file.c
 delete mode 100644 fs/smbfs/getopt.c
 delete mode 100644 fs/smbfs/getopt.h
 delete mode 100644 fs/smbfs/inode.c
 delete mode 100644 fs/smbfs/ioctl.c
 delete mode 100644 fs/smbfs/proc.c
 delete mode 100644 fs/smbfs/proto.h
 delete mode 100644 fs/smbfs/request.c
 delete mode 100644 fs/smbfs/request.h
 delete mode 100644 fs/smbfs/smb_debug.h
 delete mode 100644 fs/smbfs/smbiod.c
 delete mode 100644 fs/smbfs/sock.c
 delete mode 100644 fs/smbfs/symlink.c
 delete mode 100644 include/linux/smb.h
 delete mode 100644 include/linux/smb_fs.h
 delete mode 100644 include/linux/smb_fs_i.h
 delete mode 100644 include/linux/smb_fs_sb.h
 delete mode 100644 include/linux/smb_mount.h
 delete mode 100644 include/linux/smbno.h

(limited to 'fs')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 4303614b5add..8c624a18f67d 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -96,8 +96,6 @@ seq_file.txt
 	- how to use the seq_file API
 sharedsubtree.txt
 	- a description of shared subtrees for namespaces.
-smbfs.txt
-	- info on using filesystems with the SMB protocol (Win 3.11 and NT).
 spufs.txt
 	- info and mount options for the SPU filesystem used on Cell.
 sysfs-pci.txt
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt
deleted file mode 100644
index 194fb0decd2c..000000000000
--- a/Documentation/filesystems/smbfs.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Smbfs is a filesystem that implements the SMB protocol, which is the
-protocol used by Windows for Workgroups, Windows 95 and Windows NT.
-Smbfs was inspired by Samba, the program written by Andrew Tridgell
-that turns any Unix host into a file server for DOS or Windows clients.
-
-Smbfs is a SMB client, but uses parts of samba for its operation. For
-more info on samba, including documentation, please go to
-http://www.samba.org/ and then on to your nearest mirror.
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 33223ff121d8..d15834a6e461 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -259,7 +259,7 @@ Code  Seq#(hex)	Include File		Comments
 't'	00-7F	linux/if_ppp.h
 't'	80-8F	linux/isdn_ppp.h
 't'	90	linux/toshiba.h
-'u'	00-1F	linux/smb_fs.h
+'u'	00-1F	linux/smb_fs.h		gone
 'v'	all	linux/videodev.h	conflict!
 'v'	00-1F	linux/ext2_fs.h		conflict!
 'v'	00-1F	linux/fs.h		conflict!
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 01503536e457..4a9190808b73 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -149,6 +149,8 @@ source "drivers/staging/msm/Kconfig"
 
 source "drivers/staging/lirc/Kconfig"
 
+source "drivers/staging/smbfs/Kconfig"
+
 source "drivers/staging/easycap/Kconfig"
 
 source "drivers/staging/solo6x10/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index de5c0e5a99ed..7cb02a8e9f77 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_IDE_PHISON)	+= phison/
 obj-$(CONFIG_LINE6_USB)		+= line6/
 obj-$(CONFIG_USB_SERIAL_QUATECH2)	+= serqt_usb2/
+obj-$(CONFIG_SMB_FS)		+= smbfs/
 obj-$(CONFIG_USB_SERIAL_QUATECH_USB2)	+= quatech_usb2/
 obj-$(CONFIG_OCTEON_ETHERNET)	+= octeon/
 obj-$(CONFIG_VT6655)		+= vt6655/
diff --git a/drivers/staging/smbfs/Kconfig b/drivers/staging/smbfs/Kconfig
new file mode 100644
index 000000000000..e668127c8b2e
--- /dev/null
+++ b/drivers/staging/smbfs/Kconfig
@@ -0,0 +1,55 @@
+config SMB_FS
+	tristate "SMB file system support (OBSOLETE, please use CIFS)"
+	depends on INET
+	select NLS
+	help
+	  SMB (Server Message Block) is the protocol Windows for Workgroups
+	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
+	  files and printers over local networks.  Saying Y here allows you to
+	  mount their file systems (often called "shares" in this context) and
+	  access them just like any other Unix directory.  Currently, this
+	  works only if the Windows machines use TCP/IP as the underlying
+	  transport protocol, and not NetBEUI.  For details, read
+	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>.
+
+	  Note: if you just want your box to act as an SMB *server* and make
+	  files and printing services available to Windows clients (which need
+	  to have a TCP/IP stack), you don't need to say Y here; you can use
+	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
+	  for that.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile the SMB support as a module, choose M here:
+	  the module will be called smbfs.  Most people say N, however.
+
+config SMB_NLS_DEFAULT
+	bool "Use a default NLS"
+	depends on SMB_FS
+	help
+	  Enabling this will make smbfs use nls translations by default. You
+	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
+	  settings and you need to give the default nls for the SMB server as
+	  CONFIG_SMB_NLS_REMOTE.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
+
+config SMB_NLS_REMOTE
+	string "Default Remote NLS Option"
+	depends on SMB_NLS_DEFAULT
+	default "cp437"
+	help
+	  This setting allows you to specify a default value for which
+	  codepage the server uses. If this field is left blank no
+	  translations will be done by default. The local codepage/charset
+	  default to CONFIG_NLS_DEFAULT.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
diff --git a/drivers/staging/smbfs/Makefile b/drivers/staging/smbfs/Makefile
new file mode 100644
index 000000000000..4faf8c4722c3
--- /dev/null
+++ b/drivers/staging/smbfs/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for the linux smb-filesystem routines.
+#
+
+obj-$(CONFIG_SMB_FS) += smbfs.o
+
+smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
+		symlink.o smbiod.o request.o
+
+# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
+# SMBFS_PARANOIA should normally be enabled.
+
+EXTRA_CFLAGS += -DSMBFS_PARANOIA
+#EXTRA_CFLAGS += -DSMBFS_DEBUG
+#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
+#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
+#EXTRA_CFLAGS += -Werror
+
diff --git a/drivers/staging/smbfs/TODO b/drivers/staging/smbfs/TODO
new file mode 100644
index 000000000000..24f4d29d53ac
--- /dev/null
+++ b/drivers/staging/smbfs/TODO
@@ -0,0 +1,8 @@
+smbfs is on its way out of the kernel, it has been replaced
+by cifs several years ago.
+
+The smbfs code uses the big kernel lock which
+is getting deprecated.
+
+Users that find smbfs to work but not cifs should contact
+the CIFS developers on linux-cifs@vger.kernel.org.
diff --git a/drivers/staging/smbfs/cache.c b/drivers/staging/smbfs/cache.c
new file mode 100644
index 000000000000..dbb98658148b
--- /dev/null
+++ b/drivers/staging/smbfs/cache.c
@@ -0,0 +1,208 @@
+/*
+ *  cache.c
+ *
+ * Copyright (C) 1997 by Bill Hawes
+ *
+ * Routines to support directory cacheing using the page cache.
+ * This cache code is almost directly taken from ncpfs.
+ *
+ * Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/net.h>
+
+#include <asm/page.h>
+
+#include "smb_fs.h"
+#include "smb_debug.h"
+#include "proto.h"
+
+/*
+ * Force the next attempt to use the cache to be a timeout.
+ * If we can't find the page that's fine, it will cause a refresh.
+ */
+void
+smb_invalid_dir_cache(struct inode * dir)
+{
+	struct smb_sb_info *server = server_from_inode(dir);
+	union  smb_dir_cache *cache = NULL;
+	struct page *page = NULL;
+
+	page = grab_cache_page(&dir->i_data, 0);
+	if (!page)
+		goto out;
+
+	if (!PageUptodate(page))
+		goto out_unlock;
+
+	cache = kmap(page);
+	cache->head.time = jiffies - SMB_MAX_AGE(server);
+
+	kunmap(page);
+	SetPageUptodate(page);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return;
+}
+
+/*
+ * Mark all dentries for 'parent' as invalid, forcing them to be re-read
+ */
+void
+smb_invalidate_dircache_entries(struct dentry *parent)
+{
+	struct smb_sb_info *server = server_from_dentry(parent);
+	struct list_head *next;
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	next = parent->d_subdirs.next;
+	while (next != &parent->d_subdirs) {
+		dentry = list_entry(next, struct dentry, d_u.d_child);
+		dentry->d_fsdata = NULL;
+		smb_age_dentry(server, dentry);
+		next = next->next;
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/*
+ * dget, but require that fpos and parent matches what the dentry contains.
+ * dentry is not known to be a valid pointer at entry.
+ */
+struct dentry *
+smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
+{
+	struct dentry *dent = dentry;
+	struct list_head *next;
+
+	if (d_validate(dent, parent)) {
+		if (dent->d_name.len <= SMB_MAXNAMELEN &&
+		    (unsigned long)dent->d_fsdata == fpos) {
+			if (!dent->d_inode) {
+				dput(dent);
+				dent = NULL;
+			}
+			return dent;
+		}
+		dput(dent);
+	}
+
+	/* If a pointer is invalid, we search the dentry. */
+	spin_lock(&dcache_lock);
+	next = parent->d_subdirs.next;
+	while (next != &parent->d_subdirs) {
+		dent = list_entry(next, struct dentry, d_u.d_child);
+		if ((unsigned long)dent->d_fsdata == fpos) {
+			if (dent->d_inode)
+				dget_locked(dent);
+			else
+				dent = NULL;
+			goto out_unlock;
+		}
+		next = next->next;
+	}
+	dent = NULL;
+out_unlock:
+	spin_unlock(&dcache_lock);
+	return dent;
+}
+
+
+/*
+ * Create dentry/inode for this file and add it to the dircache.
+ */
+int
+smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	       struct smb_cache_control *ctrl, struct qstr *qname,
+	       struct smb_fattr *entry)
+{
+	struct dentry *newdent, *dentry = filp->f_path.dentry;
+	struct inode *newino, *inode = dentry->d_inode;
+	struct smb_cache_control ctl = *ctrl;
+	int valid = 0;
+	int hashed = 0;
+	ino_t ino = 0;
+
+	qname->hash = full_name_hash(qname->name, qname->len);
+
+	if (dentry->d_op && dentry->d_op->d_hash)
+		if (dentry->d_op->d_hash(dentry, qname) != 0)
+			goto end_advance;
+
+	newdent = d_lookup(dentry, qname);
+
+	if (!newdent) {
+		newdent = d_alloc(dentry, qname);
+		if (!newdent)
+			goto end_advance;
+	} else {
+		hashed = 1;
+		memcpy((char *) newdent->d_name.name, qname->name,
+		       newdent->d_name.len);
+	}
+
+	if (!newdent->d_inode) {
+		smb_renew_times(newdent);
+		entry->f_ino = iunique(inode->i_sb, 2);
+		newino = smb_iget(inode->i_sb, entry);
+		if (newino) {
+			smb_new_dentry(newdent);
+			d_instantiate(newdent, newino);
+			if (!hashed)
+				d_rehash(newdent);
+		}
+	} else
+		smb_set_inode_attr(newdent->d_inode, entry);
+
+        if (newdent->d_inode) {
+		ino = newdent->d_inode->i_ino;
+		newdent->d_fsdata = (void *) ctl.fpos;
+		smb_new_dentry(newdent);
+	}
+
+	if (ctl.idx >= SMB_DIRCACHE_SIZE) {
+		if (ctl.page) {
+			kunmap(ctl.page);
+			SetPageUptodate(ctl.page);
+			unlock_page(ctl.page);
+			page_cache_release(ctl.page);
+		}
+		ctl.cache = NULL;
+		ctl.idx  -= SMB_DIRCACHE_SIZE;
+		ctl.ofs  += 1;
+		ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
+		if (ctl.page)
+			ctl.cache = kmap(ctl.page);
+	}
+	if (ctl.cache) {
+		ctl.cache->dentry[ctl.idx] = newdent;
+		valid = 1;
+	}
+	dput(newdent);
+
+end_advance:
+	if (!valid)
+		ctl.valid = 0;
+	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
+		if (!ino)
+			ino = find_inode_number(dentry, qname);
+		if (!ino)
+			ino = iunique(inode->i_sb, 2);
+		ctl.filled = filldir(dirent, qname->name, qname->len,
+				     filp->f_pos, ino, DT_UNKNOWN);
+		if (!ctl.filled)
+			filp->f_pos += 1;
+	}
+	ctl.fpos += 1;
+	ctl.idx  += 1;
+	*ctrl = ctl;
+	return (ctl.valid || !ctl.filled);
+}
diff --git a/drivers/staging/smbfs/dir.c b/drivers/staging/smbfs/dir.c
new file mode 100644
index 000000000000..936b3bb2099c
--- /dev/null
+++ b/drivers/staging/smbfs/dir.c
@@ -0,0 +1,702 @@
+/*
+ *  dir.c
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/smp_lock.h>
+#include <linux/ctype.h>
+#include <linux/net.h>
+#include <linux/sched.h>
+
+#include "smb_fs.h"
+#include "smb_mount.h"
+#include "smbno.h"
+
+#include "smb_debug.h"
+#include "proto.h"
+
+static int smb_readdir(struct file *, void *, filldir_t);
+static int smb_dir_open(struct inode *, struct file *);
+
+static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
+static int smb_mkdir(struct inode *, struct dentry *, int);
+static int smb_rmdir(struct inode *, struct dentry *);
+static int smb_unlink(struct inode *, struct dentry *);
+static int smb_rename(struct inode *, struct dentry *,
+		      struct inode *, struct dentry *);
+static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
+static int smb_link(struct dentry *, struct inode *, struct dentry *);
+
+const struct file_operations smb_dir_operations =
+{
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= smb_readdir,
+	.unlocked_ioctl	= smb_ioctl,
+	.open		= smb_dir_open,
+};
+
+const struct inode_operations smb_dir_inode_operations =
+{
+	.create		= smb_create,
+	.lookup		= smb_lookup,
+	.unlink		= smb_unlink,
+	.mkdir		= smb_mkdir,
+	.rmdir		= smb_rmdir,
+	.rename		= smb_rename,
+	.getattr	= smb_getattr,
+	.setattr	= smb_notify_change,
+};
+
+const struct inode_operations smb_dir_inode_operations_unix =
+{
+	.create		= smb_create,
+	.lookup		= smb_lookup,
+	.unlink		= smb_unlink,
+	.mkdir		= smb_mkdir,
+	.rmdir		= smb_rmdir,
+	.rename		= smb_rename,
+	.getattr	= smb_getattr,
+	.setattr	= smb_notify_change,
+	.symlink	= smb_symlink,
+	.mknod		= smb_make_node,
+	.link		= smb_link,
+};
+
+/*
+ * Read a directory, using filldir to fill the dirent memory.
+ * smb_proc_readdir does the actual reading from the smb server.
+ *
+ * The cache code is almost directly taken from ncpfs
+ */
+static int 
+smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *dir = dentry->d_inode;
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	union  smb_dir_cache *cache = NULL;
+	struct smb_cache_control ctl;
+	struct page *page = NULL;
+	int result;
+
+	ctl.page  = NULL;
+	ctl.cache = NULL;
+
+	VERBOSE("reading %s/%s, f_pos=%d\n",
+		DENTRY_PATH(dentry),  (int) filp->f_pos);
+
+	result = 0;
+
+	lock_kernel();
+
+	switch ((unsigned int) filp->f_pos) {
+	case 0:
+		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos = 1;
+		/* fallthrough */
+	case 1:
+		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
+			goto out;
+		filp->f_pos = 2;
+	}
+
+	/*
+	 * Make sure our inode is up-to-date.
+	 */
+	result = smb_revalidate_inode(dentry);
+	if (result)
+		goto out;
+
+
+	page = grab_cache_page(&dir->i_data, 0);
+	if (!page)
+		goto read_really;
+
+	ctl.cache = cache = kmap(page);
+	ctl.head  = cache->head;
+
+	if (!PageUptodate(page) || !ctl.head.eof) {
+		VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
+			 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
+		goto init_cache;
+	}
+
+	if (filp->f_pos == 2) {
+		if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
+			goto init_cache;
+
+		/*
+		 * N.B. ncpfs checks mtime of dentry too here, we don't.
+		 *   1. common smb servers do not update mtime on dir changes
+		 *   2. it requires an extra smb request
+		 *      (revalidate has the same timeout as ctl.head.time)
+		 *
+		 * Instead smbfs invalidates its own cache on local changes
+		 * and remote changes are not seen until timeout.
+		 */
+	}
+
+	if (filp->f_pos > ctl.head.end)
+		goto finished;
+
+	ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
+	ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
+	ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;
+
+	for (;;) {
+		if (ctl.ofs != 0) {
+			ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
+			if (!ctl.page)
+				goto invalid_cache;
+			ctl.cache = kmap(ctl.page);
+			if (!PageUptodate(ctl.page))
+				goto invalid_cache;
+		}
+		while (ctl.idx < SMB_DIRCACHE_SIZE) {
+			struct dentry *dent;
+			int res;
+
+			dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
+					     dentry, filp->f_pos);
+			if (!dent)
+				goto invalid_cache;
+
+			res = filldir(dirent, dent->d_name.name,
+				      dent->d_name.len, filp->f_pos,
+				      dent->d_inode->i_ino, DT_UNKNOWN);
+			dput(dent);
+			if (res)
+				goto finished;
+			filp->f_pos += 1;
+			ctl.idx += 1;
+			if (filp->f_pos > ctl.head.end)
+				goto finished;
+		}
+		if (ctl.page) {
+			kunmap(ctl.page);
+			SetPageUptodate(ctl.page);
+			unlock_page(ctl.page);
+			page_cache_release(ctl.page);
+			ctl.page = NULL;
+		}
+		ctl.idx  = 0;
+		ctl.ofs += 1;
+	}
+invalid_cache:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+		ctl.page = NULL;
+	}
+	ctl.cache = cache;
+init_cache:
+	smb_invalidate_dircache_entries(dentry);
+	ctl.head.time = jiffies;
+	ctl.head.eof = 0;
+	ctl.fpos = 2;
+	ctl.ofs = 0;
+	ctl.idx = SMB_DIRCACHE_START;
+	ctl.filled = 0;
+	ctl.valid  = 1;
+read_really:
+	result = server->ops->readdir(filp, dirent, filldir, &ctl);
+	if (result == -ERESTARTSYS && page)
+		ClearPageUptodate(page);
+	if (ctl.idx == -1)
+		goto invalid_cache;	/* retry */
+	ctl.head.end = ctl.fpos - 1;
+	ctl.head.eof = ctl.valid;
+finished:
+	if (page) {
+		cache->head = ctl.head;
+		kunmap(page);
+		if (result != -ERESTARTSYS)
+			SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (ctl.page) {
+		kunmap(ctl.page);
+		SetPageUptodate(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+	}
+out:
+	unlock_kernel();
+	return result;
+}
+
+static int
+smb_dir_open(struct inode *dir, struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct smb_sb_info *server;
+	int error = 0;
+
+	VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
+		file->f_path.dentry->d_name.name);
+
+	/*
+	 * Directory timestamps in the core protocol aren't updated
+	 * when a file is added, so we give them a very short TTL.
+	 */
+	lock_kernel();
+	server = server_from_dentry(dentry);
+	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
+		unsigned long age = jiffies - SMB_I(dir)->oldmtime;
+		if (age > 2*HZ)
+			smb_invalid_dir_cache(dir);
+	}
+
+	/*
+	 * Note: in order to allow the smbmount process to open the
+	 * mount point, we only revalidate if the connection is valid or
+	 * if the process is trying to access something other than the root.
+	 */
+	if (server->state == CONN_VALID || !IS_ROOT(dentry))
+		error = smb_revalidate_inode(dentry);
+	unlock_kernel();
+	return error;
+}
+
+/*
+ * Dentry operations routines
+ */
+static int smb_lookup_validate(struct dentry *, struct nameidata *);
+static int smb_hash_dentry(struct dentry *, struct qstr *);
+static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+static int smb_delete_dentry(struct dentry *);
+
+static const struct dentry_operations smbfs_dentry_operations =
+{
+	.d_revalidate	= smb_lookup_validate,
+	.d_hash		= smb_hash_dentry,
+	.d_compare	= smb_compare_dentry,
+	.d_delete	= smb_delete_dentry,
+};
+
+static const struct dentry_operations smbfs_dentry_operations_case =
+{
+	.d_revalidate	= smb_lookup_validate,
+	.d_delete	= smb_delete_dentry,
+};
+
+
+/*
+ * This is the callback when the dcache has a lookup hit.
+ */
+static int
+smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	struct inode * inode = dentry->d_inode;
+	unsigned long age = jiffies - dentry->d_time;
+	int valid;
+
+	/*
+	 * The default validation is based on dentry age:
+	 * we believe in dentries for a few seconds.  (But each
+	 * successful server lookup renews the timestamp.)
+	 */
+	valid = (age <= SMB_MAX_AGE(server));
+#ifdef SMBFS_DEBUG_VERBOSE
+	if (!valid)
+		VERBOSE("%s/%s not valid, age=%lu\n", 
+			DENTRY_PATH(dentry), age);
+#endif
+
+	if (inode) {
+		lock_kernel();
+		if (is_bad_inode(inode)) {
+			PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
+			valid = 0;
+		} else if (!valid)
+			valid = (smb_revalidate_inode(dentry) == 0);
+		unlock_kernel();
+	} else {
+		/*
+		 * What should we do for negative dentries?
+		 */
+	}
+	return valid;
+}
+
+static int 
+smb_hash_dentry(struct dentry *dir, struct qstr *this)
+{
+	unsigned long hash;
+	int i;
+
+	hash = init_name_hash();
+	for (i=0; i < this->len ; i++)
+		hash = partial_name_hash(tolower(this->name[i]), hash);
+	this->hash = end_name_hash(hash);
+  
+	return 0;
+}
+
+static int
+smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
+{
+	int i, result = 1;
+
+	if (a->len != b->len)
+		goto out;
+	for (i=0; i < a->len; i++) {
+		if (tolower(a->name[i]) != tolower(b->name[i]))
+			goto out;
+	}
+	result = 0;
+out:
+	return result;
+}
+
+/*
+ * This is the callback from dput() when d_count is going to 0.
+ * We use this to unhash dentries with bad inodes.
+ */
+static int
+smb_delete_dentry(struct dentry * dentry)
+{
+	if (dentry->d_inode) {
+		if (is_bad_inode(dentry->d_inode)) {
+			PARANOIA("bad inode, unhashing %s/%s\n",
+				 DENTRY_PATH(dentry));
+			return 1;
+		}
+	} else {
+		/* N.B. Unhash negative dentries? */
+	}
+	return 0;
+}
+
+/*
+ * Initialize a new dentry
+ */
+void
+smb_new_dentry(struct dentry *dentry)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+
+	if (server->mnt->flags & SMB_MOUNT_CASE)
+		dentry->d_op = &smbfs_dentry_operations_case;
+	else
+		dentry->d_op = &smbfs_dentry_operations;
+	dentry->d_time = jiffies;
+}
+
+
+/*
+ * Whenever a lookup succeeds, we know the parent directories
+ * are all valid, so we want to update the dentry timestamps.
+ * N.B. Move this to dcache?
+ */
+void
+smb_renew_times(struct dentry * dentry)
+{
+	dget(dentry);
+	spin_lock(&dentry->d_lock);
+	for (;;) {
+		struct dentry *parent;
+
+		dentry->d_time = jiffies;
+		if (IS_ROOT(dentry))
+			break;
+		parent = dentry->d_parent;
+		dget(parent);
+		spin_unlock(&dentry->d_lock);
+		dput(dentry);
+		dentry = parent;
+		spin_lock(&dentry->d_lock);
+	}
+	spin_unlock(&dentry->d_lock);
+	dput(dentry);
+}
+
+static struct dentry *
+smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct smb_fattr finfo;
+	struct inode *inode;
+	int error;
+	struct smb_sb_info *server;
+
+	error = -ENAMETOOLONG;
+	if (dentry->d_name.len > SMB_MAXNAMELEN)
+		goto out;
+
+	/* Do not allow lookup of names with backslashes in */
+	error = -EINVAL;
+	if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
+		goto out;
+
+	lock_kernel();
+	error = smb_proc_getattr(dentry, &finfo);
+#ifdef SMBFS_PARANOIA
+	if (error && error != -ENOENT)
+		PARANOIA("find %s/%s failed, error=%d\n",
+			 DENTRY_PATH(dentry), error);
+#endif
+
+	inode = NULL;
+	if (error == -ENOENT)
+		goto add_entry;
+	if (!error) {
+		error = -EACCES;
+		finfo.f_ino = iunique(dentry->d_sb, 2);
+		inode = smb_iget(dir->i_sb, &finfo);
+		if (inode) {
+	add_entry:
+			server = server_from_dentry(dentry);
+			if (server->mnt->flags & SMB_MOUNT_CASE)
+				dentry->d_op = &smbfs_dentry_operations_case;
+			else
+				dentry->d_op = &smbfs_dentry_operations;
+
+			d_add(dentry, inode);
+			smb_renew_times(dentry);
+			error = 0;
+		}
+	}
+	unlock_kernel();
+out:
+	return ERR_PTR(error);
+}
+
+/*
+ * This code is common to all routines creating a new inode.
+ */
+static int
+smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	struct inode *inode;
+	int error;
+	struct smb_fattr fattr;
+
+	VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
+
+	error = smb_proc_getattr(dentry, &fattr);
+	if (error)
+		goto out_close;
+
+	smb_renew_times(dentry);
+	fattr.f_ino = iunique(dentry->d_sb, 2);
+	inode = smb_iget(dentry->d_sb, &fattr);
+	if (!inode)
+		goto out_no_inode;
+
+	if (have_id) {
+		struct smb_inode_info *ei = SMB_I(inode);
+		ei->fileid = fileid;
+		ei->access = SMB_O_RDWR;
+		ei->open = server->generation;
+	}
+	d_instantiate(dentry, inode);
+out:
+	return error;
+
+out_no_inode:
+	error = -EACCES;
+out_close:
+	if (have_id) {
+		PARANOIA("%s/%s failed, error=%d, closing %u\n",
+			 DENTRY_PATH(dentry), error, fileid);
+		smb_close_fileid(dentry, fileid);
+	}
+	goto out;
+}
+
+/* N.B. How should the mode argument be used? */
+static int
+smb_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	__u16 fileid;
+	int error;
+	struct iattr attr;
+
+	VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
+
+	lock_kernel();
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
+	if (!error) {
+		if (server->opt.capabilities & SMB_CAP_UNIX) {
+			/* Set attributes for new file */
+			attr.ia_valid = ATTR_MODE;
+			attr.ia_mode = mode;
+			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
+		}
+		error = smb_instantiate(dentry, fileid, 1);
+	} else {
+		PARANOIA("%s/%s failed, error=%d\n",
+			 DENTRY_PATH(dentry), error);
+	}
+	unlock_kernel();
+	return error;
+}
+
+/* N.B. How should the mode argument be used? */
+static int
+smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	int error;
+	struct iattr attr;
+
+	lock_kernel();
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_mkdir(dentry);
+	if (!error) {
+		if (server->opt.capabilities & SMB_CAP_UNIX) {
+			/* Set attributes for new directory */
+			attr.ia_valid = ATTR_MODE;
+			attr.ia_mode = mode;
+			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
+		}
+		error = smb_instantiate(dentry, 0, 0);
+	}
+	unlock_kernel();
+	return error;
+}
+
+static int
+smb_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	/*
+	 * Close the directory if it's open.
+	 */
+	lock_kernel();
+	smb_close(inode);
+
+	/*
+	 * Check that nobody else is using the directory..
+	 */
+	error = -EBUSY;
+	if (!d_unhashed(dentry))
+		goto out;
+
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_rmdir(dentry);
+
+out:
+	unlock_kernel();
+	return error;
+}
+
+static int
+smb_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+
+	/*
+	 * Close the file if it's open.
+	 */
+	lock_kernel();
+	smb_close(dentry->d_inode);
+
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_unlink(dentry);
+	if (!error)
+		smb_renew_times(dentry);
+	unlock_kernel();
+	return error;
+}
+
+static int
+smb_rename(struct inode *old_dir, struct dentry *old_dentry,
+	   struct inode *new_dir, struct dentry *new_dentry)
+{
+	int error;
+
+	/*
+	 * Close any open files, and check whether to delete the
+	 * target before attempting the rename.
+	 */
+	lock_kernel();
+	if (old_dentry->d_inode)
+		smb_close(old_dentry->d_inode);
+	if (new_dentry->d_inode) {
+		smb_close(new_dentry->d_inode);
+		error = smb_proc_unlink(new_dentry);
+		if (error) {
+			VERBOSE("unlink %s/%s, error=%d\n",
+				DENTRY_PATH(new_dentry), error);
+			goto out;
+		}
+		/* FIXME */
+		d_delete(new_dentry);
+	}
+
+	smb_invalid_dir_cache(old_dir);
+	smb_invalid_dir_cache(new_dir);
+	error = smb_proc_mv(old_dentry, new_dentry);
+	if (!error) {
+		smb_renew_times(old_dentry);
+		smb_renew_times(new_dentry);
+	}
+out:
+	unlock_kernel();
+	return error;
+}
+
+/*
+ * FIXME: samba servers won't let you create device nodes unless uid/gid
+ * matches the connection credentials (and we don't know which those are ...)
+ */
+static int
+smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	int error;
+	struct iattr attr;
+
+	attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
+	attr.ia_mode = mode;
+	current_euid_egid(&attr.ia_uid, &attr.ia_gid);
+
+	if (!new_valid_dev(dev))
+		return -EINVAL;
+
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
+	if (!error) {
+		error = smb_instantiate(dentry, 0, 0);
+	}
+	return error;
+}
+
+/*
+ * dentry = existing file
+ * new_dentry = new file
+ */
+static int
+smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
+{
+	int error;
+
+	DEBUG1("smb_link old=%s/%s new=%s/%s\n",
+	       DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
+	smb_invalid_dir_cache(dir);
+	error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
+	if (!error) {
+		smb_renew_times(dentry);
+		error = smb_instantiate(new_dentry, 0, 0);
+	}
+	return error;
+}
diff --git a/drivers/staging/smbfs/file.c b/drivers/staging/smbfs/file.c
new file mode 100644
index 000000000000..5dcd19c60eb9
--- /dev/null
+++ b/drivers/staging/smbfs/file.c
@@ -0,0 +1,453 @@
+/*
+ *  file.c
+ *
+ *  Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/net.h>
+#include <linux/aio.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include "smbno.h"
+#include "smb_fs.h"
+#include "smb_debug.h"
+#include "proto.h"
+
+static int
+smb_fsync(struct file *file, int datasync)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	int result;
+
+	VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
+
+	/*
+	 * The VFS will writepage() all dirty pages for us, but we
+	 * should send a SMBflush to the server, letting it know that
+	 * we want things synchronized with actual storage.
+	 *
+	 * Note: this function requires all pages to have been written already
+	 *       (should be ok with writepage_sync)
+	 */
+	result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
+	return result;
+}
+
+/*
+ * Read a page synchronously.
+ */
+static int
+smb_readpage_sync(struct dentry *dentry, struct page *page)
+{
+	char *buffer = kmap(page);
+	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	unsigned int rsize = smb_get_rsize(server);
+	int count = PAGE_SIZE;
+	int result;
+
+	VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
+		DENTRY_PATH(dentry), count, offset, rsize);
+
+	result = smb_open(dentry, SMB_O_RDONLY);
+	if (result < 0)
+		goto io_error;
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
+		if (result < 0)
+			goto io_error;
+
+		count -= result;
+		offset += result;
+		buffer += result;
+		dentry->d_inode->i_atime =
+			current_fs_time(dentry->d_inode->i_sb);
+		if (result < rsize)
+			break;
+	} while (count);
+
+	memset(buffer, 0, count);
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	result = 0;
+
+io_error:
+	kunmap(page);
+	unlock_page(page);
+	return result;
+}
+
+/*
+ * We are called with the page locked and we unlock it when done.
+ */
+static int
+smb_readpage(struct file *file, struct page *page)
+{
+	int		error;
+	struct dentry  *dentry = file->f_path.dentry;
+
+	page_cache_get(page);
+	error = smb_readpage_sync(dentry, page);
+	page_cache_release(page);
+	return error;
+}
+
+/*
+ * Write a page synchronously.
+ * Offset is the data offset within the page.
+ */
+static int
+smb_writepage_sync(struct inode *inode, struct page *page,
+		   unsigned long pageoffset, unsigned int count)
+{
+	loff_t offset;
+	char *buffer = kmap(page) + pageoffset;
+	struct smb_sb_info *server = server_from_inode(inode);
+	unsigned int wsize = smb_get_wsize(server);
+	int ret = 0;
+
+	offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
+	VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
+		inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
+
+	do {
+		int write_ret;
+
+		if (count < wsize)
+			wsize = count;
+
+		write_ret = server->ops->write(inode, offset, wsize, buffer);
+		if (write_ret < 0) {
+			PARANOIA("failed write, wsize=%d, write_ret=%d\n",
+				 wsize, write_ret);
+			ret = write_ret;
+			break;
+		}
+		/* N.B. what if result < wsize?? */
+#ifdef SMBFS_PARANOIA
+		if (write_ret < wsize)
+			PARANOIA("short write, wsize=%d, write_ret=%d\n",
+				 wsize, write_ret);
+#endif
+		buffer += wsize;
+		offset += wsize;
+		count -= wsize;
+		/*
+		 * Update the inode now rather than waiting for a refresh.
+		 */
+		inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
+		SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
+		if (offset > inode->i_size)
+			inode->i_size = offset;
+	} while (count);
+
+	kunmap(page);
+	return ret;
+}
+
+/*
+ * Write a page to the server. This will be used for NFS swapping only
+ * (for now), and we currently do this synchronously only.
+ *
+ * We are called with the page locked and we unlock it when done.
+ */
+static int
+smb_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	unsigned long end_index;
+	unsigned offset = PAGE_CACHE_SIZE;
+	int err;
+
+	BUG_ON(!mapping);
+	inode = mapping->host;
+	BUG_ON(!inode);
+
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+
+	/* easy case */
+	if (page->index < end_index)
+		goto do_it;
+	/* things got complicated... */
+	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+	/* OK, are we completely out? */
+	if (page->index >= end_index+1 || !offset)
+		return 0; /* truncated - don't care */
+do_it:
+	page_cache_get(page);
+	err = smb_writepage_sync(inode, page, 0, offset);
+	SetPageUptodate(page);
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+static int
+smb_updatepage(struct file *file, struct page *page, unsigned long offset,
+	       unsigned int count)
+{
+	struct dentry *dentry = file->f_path.dentry;
+
+	DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
+		((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
+
+	return smb_writepage_sync(dentry->d_inode, page, offset, count);
+}
+
+static ssize_t
+smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
+{
+	struct file * file = iocb->ki_filp;
+	struct dentry * dentry = file->f_path.dentry;
+	ssize_t	status;
+
+	VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
+
+	status = smb_revalidate_inode(dentry);
+	if (status) {
+		PARANOIA("%s/%s validation failed, error=%Zd\n",
+			 DENTRY_PATH(dentry), status);
+		goto out;
+	}
+
+	VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
+		(long)dentry->d_inode->i_size,
+		dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
+
+	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
+out:
+	return status;
+}
+
+static int
+smb_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct dentry * dentry = file->f_path.dentry;
+	int	status;
+
+	VERBOSE("file %s/%s, address %lu - %lu\n",
+		DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
+
+	status = smb_revalidate_inode(dentry);
+	if (status) {
+		PARANOIA("%s/%s validation failed, error=%d\n",
+			 DENTRY_PATH(dentry), status);
+		goto out;
+	}
+	status = generic_file_mmap(file, vma);
+out:
+	return status;
+}
+
+static ssize_t
+smb_file_splice_read(struct file *file, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t count,
+		     unsigned int flags)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	ssize_t status;
+
+	VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
+		DENTRY_PATH(dentry), *ppos, count);
+
+	status = smb_revalidate_inode(dentry);
+	if (status) {
+		PARANOIA("%s/%s validation failed, error=%Zd\n",
+			 DENTRY_PATH(dentry), status);
+		goto out;
+	}
+	status = generic_file_splice_read(file, ppos, pipe, count, flags);
+out:
+	return status;
+}
+
+/*
+ * This does the "real" work of the write. The generic routine has
+ * allocated the page, locked it, done all the page alignment stuff
+ * calculations etc. Now we should just copy the data from user
+ * space and write it back to the real medium..
+ *
+ * If the writer ends up delaying the write, the writer needs to
+ * increment the page use counts until he is done with the page.
+ */
+static int smb_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
+	if (!*pagep)
+		return -ENOMEM;
+	return 0;
+}
+
+static int smb_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	int status;
+	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	lock_kernel();
+	status = smb_updatepage(file, page, offset, copied);
+	unlock_kernel();
+
+	if (!status) {
+		if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+			SetPageUptodate(page);
+		status = copied;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return status;
+}
+
+const struct address_space_operations smb_file_aops = {
+	.readpage = smb_readpage,
+	.writepage = smb_writepage,
+	.write_begin = smb_write_begin,
+	.write_end = smb_write_end,
+};
+
+/* 
+ * Write to a file (through the page cache).
+ */
+static ssize_t
+smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	struct file * file = iocb->ki_filp;
+	struct dentry * dentry = file->f_path.dentry;
+	ssize_t	result;
+
+	VERBOSE("file %s/%s, count=%lu@%lu\n",
+		DENTRY_PATH(dentry),
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
+
+	result = smb_revalidate_inode(dentry);
+	if (result) {
+		PARANOIA("%s/%s validation failed, error=%Zd\n",
+			 DENTRY_PATH(dentry), result);
+		goto out;
+	}
+
+	result = smb_open(dentry, SMB_O_WRONLY);
+	if (result)
+		goto out;
+
+	if (iocb->ki_left > 0) {
+		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
+			(long) file->f_pos, (long) dentry->d_inode->i_size,
+			dentry->d_inode->i_mtime.tv_sec,
+			dentry->d_inode->i_atime.tv_sec);
+	}
+out:
+	return result;
+}
+
+static int
+smb_file_open(struct inode *inode, struct file * file)
+{
+	int result;
+	struct dentry *dentry = file->f_path.dentry;
+	int smb_mode = (file->f_mode & O_ACCMODE) - 1;
+
+	lock_kernel();
+	result = smb_open(dentry, smb_mode);
+	if (result)
+		goto out;
+	SMB_I(inode)->openers++;
+out:
+	unlock_kernel();
+	return result;
+}
+
+static int
+smb_file_release(struct inode *inode, struct file * file)
+{
+	lock_kernel();
+	if (!--SMB_I(inode)->openers) {
+		/* We must flush any dirty pages now as we won't be able to
+		   write anything after close. mmap can trigger this.
+		   "openers" should perhaps include mmap'ers ... */
+		filemap_write_and_wait(inode->i_mapping);
+		smb_close(inode);
+	}
+	unlock_kernel();
+	return 0;
+}
+
+/*
+ * Check whether the required access is compatible with
+ * an inode's permission. SMB doesn't recognize superuser
+ * privileges, so we need our own check for this.
+ */
+static int
+smb_file_permission(struct inode *inode, int mask)
+{
+	int mode = inode->i_mode;
+	int error = 0;
+
+	VERBOSE("mode=%x, mask=%x\n", mode, mask);
+
+	/* Look at user permissions */
+	mode >>= 6;
+	if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
+		error = -EACCES;
+	return error;
+}
+
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t ret;
+	lock_kernel();
+	ret = generic_file_llseek_unlocked(file, offset, origin);
+	unlock_kernel();
+	return ret;
+}
+
+const struct file_operations smb_file_operations =
+{
+	.llseek 	= smb_remote_llseek,
+	.read		= do_sync_read,
+	.aio_read	= smb_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= smb_file_aio_write,
+	.unlocked_ioctl	= smb_ioctl,
+	.mmap		= smb_file_mmap,
+	.open		= smb_file_open,
+	.release	= smb_file_release,
+	.fsync		= smb_fsync,
+	.splice_read	= smb_file_splice_read,
+};
+
+const struct inode_operations smb_file_inode_operations =
+{
+	.permission	= smb_file_permission,
+	.getattr	= smb_getattr,
+	.setattr	= smb_notify_change,
+};
diff --git a/drivers/staging/smbfs/getopt.c b/drivers/staging/smbfs/getopt.c
new file mode 100644
index 000000000000..7ae0f5273ab1
--- /dev/null
+++ b/drivers/staging/smbfs/getopt.c
@@ -0,0 +1,64 @@
+/*
+ * getopt.c
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/net.h>
+
+#include "getopt.h"
+
+/**
+ *	smb_getopt - option parser
+ *	@caller: name of the caller, for error messages
+ *	@options: the options string
+ *	@opts: an array of &struct option entries controlling parser operations
+ *	@optopt: output; will contain the current option
+ *	@optarg: output; will contain the value (if one exists)
+ *	@flag: output; may be NULL; should point to a long for or'ing flags
+ *	@value: output; may be NULL; will be overwritten with the integer value
+ *		of the current argument.
+ *
+ *	Helper to parse options on the format used by mount ("a=b,c=d,e,f").
+ *	Returns opts->val if a matching entry in the 'opts' array is found,
+ *	0 when no more tokens are found, -1 if an error is encountered.
+ */
+int smb_getopt(char *caller, char **options, struct option *opts,
+	       char **optopt, char **optarg, unsigned long *flag,
+	       unsigned long *value)
+{
+	char *token;
+	char *val;
+	int i;
+
+	do {
+		if ((token = strsep(options, ",")) == NULL)
+			return 0;
+	} while (*token == '\0');
+	*optopt = token;
+
+	*optarg = NULL;
+	if ((val = strchr (token, '=')) != NULL) {
+		*val++ = 0;
+		if (value)
+			*value = simple_strtoul(val, NULL, 0);
+		*optarg = val;
+	}
+
+	for (i = 0; opts[i].name != NULL; i++) {
+		if (!strcmp(opts[i].name, token)) {
+			if (!opts[i].flag && (!val || !*val)) {
+				printk("%s: the %s option requires an argument\n",
+				       caller, token);
+				return -1;
+			}
+
+			if (flag && opts[i].flag)
+				*flag |= opts[i].flag;
+
+			return opts[i].val;
+		}
+	}
+	printk("%s: Unrecognized mount option %s\n", caller, token);
+	return -1;
+}
diff --git a/drivers/staging/smbfs/getopt.h b/drivers/staging/smbfs/getopt.h
new file mode 100644
index 000000000000..146219ac7c46
--- /dev/null
+++ b/drivers/staging/smbfs/getopt.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_GETOPT_H
+#define _LINUX_GETOPT_H
+
+struct option {
+	const char *name;
+	unsigned long flag;
+	int val;
+};
+
+extern int smb_getopt(char *caller, char **options, struct option *opts,
+		      char **optopt, char **optarg, unsigned long *flag,
+		      unsigned long *value);
+
+#endif /* _LINUX_GETOPT_H */
diff --git a/drivers/staging/smbfs/inode.c b/drivers/staging/smbfs/inode.c
new file mode 100644
index 000000000000..9287599ddb50
--- /dev/null
+++ b/drivers/staging/smbfs/inode.c
@@ -0,0 +1,839 @@
+/*
+ *  inode.c
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/smp_lock.h>
+#include <linux/nls.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/net.h>
+#include <linux/vfs.h>
+#include <linux/highuid.h>
+#include <linux/sched.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "smb_fs.h"
+#include "smbno.h"
+#include "smb_mount.h"
+#include "smb_debug.h"
+#include "getopt.h"
+#include "proto.h"
+
+/* Always pick a default string */
+#ifdef CONFIG_SMB_NLS_REMOTE
+#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
+#else
+#define SMB_NLS_REMOTE ""
+#endif
+
+#define SMB_TTL_DEFAULT 1000
+
+static void smb_evict_inode(struct inode *);
+static void smb_put_super(struct super_block *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
+static int  smb_show_options(struct seq_file *, struct vfsmount *);
+
+static struct kmem_cache *smb_inode_cachep;
+
+static struct inode *smb_alloc_inode(struct super_block *sb)
+{
+	struct smb_inode_info *ei;
+	ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void smb_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
+}
+
+static void init_once(void *foo)
+{
+	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+	smb_inode_cachep = kmem_cache_create("smb_inode_cache",
+					     sizeof(struct smb_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once);
+	if (smb_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(smb_inode_cachep);
+}
+
+static int smb_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags |= MS_NODIRATIME;
+	return 0;
+}
+
+static const struct super_operations smb_sops =
+{
+	.alloc_inode	= smb_alloc_inode,
+	.destroy_inode	= smb_destroy_inode,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= smb_evict_inode,
+	.put_super	= smb_put_super,
+	.statfs		= smb_statfs,
+	.show_options	= smb_show_options,
+	.remount_fs	= smb_remount,
+};
+
+
+/* We are always generating a new inode here */
+struct inode *
+smb_iget(struct super_block *sb, struct smb_fattr *fattr)
+{
+	struct smb_sb_info *server = SMB_SB(sb);
+	struct inode *result;
+
+	DEBUG1("smb_iget: %p\n", fattr);
+
+	result = new_inode(sb);
+	if (!result)
+		return result;
+	result->i_ino = fattr->f_ino;
+	SMB_I(result)->open = 0;
+	SMB_I(result)->fileid = 0;
+	SMB_I(result)->access = 0;
+	SMB_I(result)->flags = 0;
+	SMB_I(result)->closed = 0;
+	SMB_I(result)->openers = 0;
+	smb_set_inode_attr(result, fattr);
+	if (S_ISREG(result->i_mode)) {
+		result->i_op = &smb_file_inode_operations;
+		result->i_fop = &smb_file_operations;
+		result->i_data.a_ops = &smb_file_aops;
+	} else if (S_ISDIR(result->i_mode)) {
+		if (server->opt.capabilities & SMB_CAP_UNIX)
+			result->i_op = &smb_dir_inode_operations_unix;
+		else
+			result->i_op = &smb_dir_inode_operations;
+		result->i_fop = &smb_dir_operations;
+	} else if (S_ISLNK(result->i_mode)) {
+		result->i_op = &smb_link_inode_operations;
+	} else {
+		init_special_inode(result, result->i_mode, fattr->f_rdev);
+	}
+	insert_inode_hash(result);
+	return result;
+}
+
+/*
+ * Copy the inode data to a smb_fattr structure.
+ */
+void
+smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
+{
+	memset(fattr, 0, sizeof(struct smb_fattr));
+	fattr->f_mode	= inode->i_mode;
+	fattr->f_nlink	= inode->i_nlink;
+	fattr->f_ino	= inode->i_ino;
+	fattr->f_uid	= inode->i_uid;
+	fattr->f_gid	= inode->i_gid;
+	fattr->f_size	= inode->i_size;
+	fattr->f_mtime	= inode->i_mtime;
+	fattr->f_ctime	= inode->i_ctime;
+	fattr->f_atime	= inode->i_atime;
+	fattr->f_blocks	= inode->i_blocks;
+
+	fattr->attr	= SMB_I(inode)->attr;
+	/*
+	 * Keep the attributes in sync with the inode permissions.
+	 */
+	if (fattr->f_mode & S_IWUSR)
+		fattr->attr &= ~aRONLY;
+	else
+		fattr->attr |= aRONLY;
+}
+
+/*
+ * Update the inode, possibly causing it to invalidate its pages if mtime/size
+ * is different from last time.
+ */
+void
+smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
+{
+	struct smb_inode_info *ei = SMB_I(inode);
+
+	/*
+	 * A size change should have a different mtime, or same mtime
+	 * but different size.
+	 */
+	time_t last_time = inode->i_mtime.tv_sec;
+	loff_t last_sz = inode->i_size;
+
+	inode->i_mode	= fattr->f_mode;
+	inode->i_nlink	= fattr->f_nlink;
+	inode->i_uid	= fattr->f_uid;
+	inode->i_gid	= fattr->f_gid;
+	inode->i_ctime	= fattr->f_ctime;
+	inode->i_blocks = fattr->f_blocks;
+	inode->i_size	= fattr->f_size;
+	inode->i_mtime	= fattr->f_mtime;
+	inode->i_atime	= fattr->f_atime;
+	ei->attr = fattr->attr;
+
+	/*
+	 * Update the "last time refreshed" field for revalidation.
+	 */
+	ei->oldmtime = jiffies;
+
+	if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
+		VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
+			inode->i_ino,
+			(long) last_time, (long) inode->i_mtime.tv_sec,
+			(long) last_sz, (long) inode->i_size);
+
+		if (!S_ISDIR(inode->i_mode))
+			invalidate_remote_inode(inode);
+	}
+}
+
+/*
+ * This is called if the connection has gone bad ...
+ * try to kill off all the current inodes.
+ */
+void
+smb_invalidate_inodes(struct smb_sb_info *server)
+{
+	VERBOSE("\n");
+	shrink_dcache_sb(SB_of(server));
+	invalidate_inodes(SB_of(server));
+}
+
+/*
+ * This is called to update the inode attributes after
+ * we've made changes to a file or directory.
+ */
+static int
+smb_refresh_inode(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+	struct smb_fattr fattr;
+
+	error = smb_proc_getattr(dentry, &fattr);
+	if (!error) {
+		smb_renew_times(dentry);
+		/*
+		 * Check whether the type part of the mode changed,
+		 * and don't update the attributes if it did.
+		 *
+		 * And don't dick with the root inode
+		 */
+		if (inode->i_ino == 2)
+			return error;
+		if (S_ISLNK(inode->i_mode))
+			return error;	/* VFS will deal with it */
+
+		if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
+			smb_set_inode_attr(inode, &fattr);
+		} else {
+			/*
+			 * Big trouble! The inode has become a new object,
+			 * so any operations attempted on it are invalid.
+			 *
+			 * To limit damage, mark the inode as bad so that
+			 * subsequent lookup validations will fail.
+			 */
+			PARANOIA("%s/%s changed mode, %07o to %07o\n",
+				 DENTRY_PATH(dentry),
+				 inode->i_mode, fattr.f_mode);
+
+			fattr.f_mode = inode->i_mode; /* save mode */
+			make_bad_inode(inode);
+			inode->i_mode = fattr.f_mode; /* restore mode */
+			/*
+			 * No need to worry about unhashing the dentry: the
+			 * lookup validation will see that the inode is bad.
+			 * But we do want to invalidate the caches ...
+			 */
+			if (!S_ISDIR(inode->i_mode))
+				invalidate_remote_inode(inode);
+			else
+				smb_invalid_dir_cache(inode);
+			error = -EIO;
+		}
+	}
+	return error;
+}
+
+/*
+ * This is called when we want to check whether the inode
+ * has changed on the server.  If it has changed, we must
+ * invalidate our local caches.
+ */
+int
+smb_revalidate_inode(struct dentry *dentry)
+{
+	struct smb_sb_info *s = server_from_dentry(dentry);
+	struct inode *inode = dentry->d_inode;
+	int error = 0;
+
+	DEBUG1("smb_revalidate_inode\n");
+	lock_kernel();
+
+	/*
+	 * Check whether we've recently refreshed the inode.
+	 */
+	if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
+		VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
+			inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
+		goto out;
+	}
+
+	error = smb_refresh_inode(dentry);
+out:
+	unlock_kernel();
+	return error;
+}
+
+/*
+ * This routine is called when i_nlink == 0 and i_count goes to 0.
+ * All blocking cleanup operations need to go here to avoid races.
+ */
+static void
+smb_evict_inode(struct inode *ino)
+{
+	DEBUG1("ino=%ld\n", ino->i_ino);
+	truncate_inode_pages(&ino->i_data, 0);
+	end_writeback(ino);
+	lock_kernel();
+	if (smb_close(ino))
+		PARANOIA("could not close inode %ld\n", ino->i_ino);
+	unlock_kernel();
+}
+
+static struct option opts[] = {
+	{ "version",	0, 'v' },
+	{ "win95",	SMB_MOUNT_WIN95, 1 },
+	{ "oldattr",	SMB_MOUNT_OLDATTR, 1 },
+	{ "dirattr",	SMB_MOUNT_DIRATTR, 1 },
+	{ "case",	SMB_MOUNT_CASE, 1 },
+	{ "uid",	0, 'u' },
+	{ "gid",	0, 'g' },
+	{ "file_mode",	0, 'f' },
+	{ "dir_mode",	0, 'd' },
+	{ "iocharset",	0, 'i' },
+	{ "codepage",	0, 'c' },
+	{ "ttl",	0, 't' },
+	{ NULL,		0, 0}
+};
+
+static int
+parse_options(struct smb_mount_data_kernel *mnt, char *options)
+{
+	int c;
+	unsigned long flags;
+	unsigned long value;
+	char *optarg;
+	char *optopt;
+
+	flags = 0;
+	while ( (c = smb_getopt("smbfs", &options, opts,
+				&optopt, &optarg, &flags, &value)) > 0) {
+
+		VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
+		switch (c) {
+		case 1:
+			/* got a "flag" option */
+			break;
+		case 'v':
+			if (value != SMB_MOUNT_VERSION) {
+			printk ("smbfs: Bad mount version %ld, expected %d\n",
+				value, SMB_MOUNT_VERSION);
+				return 0;
+			}
+			mnt->version = value;
+			break;
+		case 'u':
+			mnt->uid = value;
+			flags |= SMB_MOUNT_UID;
+			break;
+		case 'g':
+			mnt->gid = value;
+			flags |= SMB_MOUNT_GID;
+			break;
+		case 'f':
+			mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
+			flags |= SMB_MOUNT_FMODE;
+			break;
+		case 'd':
+			mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
+			flags |= SMB_MOUNT_DMODE;
+			break;
+		case 'i':
+			strlcpy(mnt->codepage.local_name, optarg, 
+				SMB_NLS_MAXNAMELEN);
+			break;
+		case 'c':
+			strlcpy(mnt->codepage.remote_name, optarg,
+				SMB_NLS_MAXNAMELEN);
+			break;
+		case 't':
+			mnt->ttl = value;
+			break;
+		default:
+			printk ("smbfs: Unrecognized mount option %s\n",
+				optopt);
+			return -1;
+		}
+	}
+	mnt->flags = flags;
+	return c;
+}
+
+/*
+ * smb_show_options() is for displaying mount options in /proc/mounts.
+ * It tries to avoid showing settings that were not changed from their
+ * defaults.
+ */
+static int
+smb_show_options(struct seq_file *s, struct vfsmount *m)
+{
+	struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
+	int i;
+
+	for (i = 0; opts[i].name != NULL; i++)
+		if (mnt->flags & opts[i].flag)
+			seq_printf(s, ",%s", opts[i].name);
+
+	if (mnt->flags & SMB_MOUNT_UID)
+		seq_printf(s, ",uid=%d", mnt->uid);
+	if (mnt->flags & SMB_MOUNT_GID)
+		seq_printf(s, ",gid=%d", mnt->gid);
+	if (mnt->mounted_uid != 0)
+		seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
+
+	/* 
+	 * Defaults for file_mode and dir_mode are unknown to us; they
+	 * depend on the current umask of the user doing the mount.
+	 */
+	if (mnt->flags & SMB_MOUNT_FMODE)
+		seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
+	if (mnt->flags & SMB_MOUNT_DMODE)
+		seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
+
+	if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
+		seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
+	if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
+		seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
+
+	if (mnt->ttl != SMB_TTL_DEFAULT)
+		seq_printf(s, ",ttl=%d", mnt->ttl);
+
+	return 0;
+}
+
+static void
+smb_unload_nls(struct smb_sb_info *server)
+{
+	unload_nls(server->remote_nls);
+	unload_nls(server->local_nls);
+}
+
+static void
+smb_put_super(struct super_block *sb)
+{
+	struct smb_sb_info *server = SMB_SB(sb);
+
+	lock_kernel();
+
+	smb_lock_server(server);
+	server->state = CONN_INVALID;
+	smbiod_unregister_server(server);
+
+	smb_close_socket(server);
+
+	if (server->conn_pid)
+		kill_pid(server->conn_pid, SIGTERM, 1);
+
+	bdi_destroy(&server->bdi);
+	kfree(server->ops);
+	smb_unload_nls(server);
+	sb->s_fs_info = NULL;
+	smb_unlock_server(server);
+	put_pid(server->conn_pid);
+	kfree(server);
+
+	unlock_kernel();
+}
+
+static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
+{
+	struct smb_sb_info *server;
+	struct smb_mount_data_kernel *mnt;
+	struct smb_mount_data *oldmnt;
+	struct inode *root_inode;
+	struct smb_fattr root;
+	int ver;
+	void *mem;
+	static int warn_count;
+
+	if (warn_count < 5) {
+		warn_count++;
+		printk(KERN_EMERG "smbfs is deprecated and will be removed"
+			" from the 2.6.37 kernel. Please migrate to cifs\n");
+	}
+
+	if (!raw_data)
+		goto out_no_data;
+
+	oldmnt = (struct smb_mount_data *) raw_data;
+	ver = oldmnt->version;
+	if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
+		goto out_wrong_data;
+
+	sb->s_flags |= MS_NODIRATIME;
+	sb->s_blocksize = 1024;	/* Eh...  Is this correct? */
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = SMB_SUPER_MAGIC;
+	sb->s_op = &smb_sops;
+	sb->s_time_gran = 100;
+
+	server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
+	if (!server)
+		goto out_no_server;
+	sb->s_fs_info = server;
+	
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
+
+	server->super_block = sb;
+	server->mnt = NULL;
+	server->sock_file = NULL;
+	init_waitqueue_head(&server->conn_wq);
+	init_MUTEX(&server->sem);
+	INIT_LIST_HEAD(&server->entry);
+	INIT_LIST_HEAD(&server->xmitq);
+	INIT_LIST_HEAD(&server->recvq);
+	server->conn_error = 0;
+	server->conn_pid = NULL;
+	server->state = CONN_INVALID; /* no connection yet */
+	server->generation = 0;
+
+	/* Allocate the global temp buffer and some superblock helper structs */
+	/* FIXME: move these to the smb_sb_info struct */
+	VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
+		sizeof(struct smb_mount_data_kernel));
+	mem = kmalloc(sizeof(struct smb_ops) +
+		      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
+	if (!mem)
+		goto out_no_mem;
+
+	server->ops = mem;
+	smb_install_null_ops(server->ops);
+	server->mnt = mem + sizeof(struct smb_ops);
+
+	/* Setup NLS stuff */
+	server->remote_nls = NULL;
+	server->local_nls = NULL;
+
+	mnt = server->mnt;
+
+	memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
+	strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
+		SMB_NLS_MAXNAMELEN);
+	strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
+		SMB_NLS_MAXNAMELEN);
+
+	mnt->ttl = SMB_TTL_DEFAULT;
+	if (ver == SMB_MOUNT_OLDVERSION) {
+		mnt->version = oldmnt->version;
+
+		SET_UID(mnt->uid, oldmnt->uid);
+		SET_GID(mnt->gid, oldmnt->gid);
+
+		mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
+		mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
+
+		mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
+			SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
+	} else {
+		mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
+				S_IROTH | S_IXOTH | S_IFREG;
+		mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
+				S_IROTH | S_IXOTH | S_IFDIR;
+		if (parse_options(mnt, raw_data))
+			goto out_bad_option;
+	}
+	mnt->mounted_uid = current_uid();
+	smb_setcodepage(server, &mnt->codepage);
+
+	/*
+	 * Display the enabled options
+	 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
+	 */
+	if (mnt->flags & SMB_MOUNT_OLDATTR)
+		printk("SMBFS: Using core getattr (Win 95 speedup)\n");
+	else if (mnt->flags & SMB_MOUNT_DIRATTR)
+		printk("SMBFS: Using dir ff getattr\n");
+
+	if (smbiod_register_server(server) < 0) {
+		printk(KERN_ERR "smbfs: failed to start smbiod\n");
+		goto out_no_smbiod;
+	}
+
+	/*
+	 * Keep the super block locked while we get the root inode.
+	 */
+	smb_init_root_dirent(server, &root, sb);
+	root_inode = smb_iget(sb, &root);
+	if (!root_inode)
+		goto out_no_root;
+
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_no_root;
+
+	smb_new_dentry(sb->s_root);
+
+	return 0;
+
+out_no_root:
+	iput(root_inode);
+out_no_smbiod:
+	smb_unload_nls(server);
+out_bad_option:
+	kfree(mem);
+out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
+	if (!server->mnt)
+		printk(KERN_ERR "smb_fill_super: allocation failure\n");
+	sb->s_fs_info = NULL;
+	kfree(server);
+	goto out_fail;
+out_wrong_data:
+	printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
+	goto out_fail;
+out_no_data:
+	printk(KERN_ERR "smb_fill_super: missing data argument\n");
+out_fail:
+	return -EINVAL;
+out_no_server:
+	printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
+	return -ENOMEM;
+}
+
+static int
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int result;
+	
+	lock_kernel();
+
+	result = smb_proc_dskattr(dentry, buf);
+
+	unlock_kernel();
+
+	buf->f_type = SMB_SUPER_MAGIC;
+	buf->f_namelen = SMB_MAXPATHLEN;
+	return result;
+}
+
+int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	int err = smb_revalidate_inode(dentry);
+	if (!err)
+		generic_fillattr(dentry->d_inode, stat);
+	return err;
+}
+
+int
+smb_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
+	int error, changed, refresh = 0;
+	struct smb_fattr fattr;
+
+	lock_kernel();
+
+	error = smb_revalidate_inode(dentry);
+	if (error)
+		goto out;
+
+	if ((error = inode_change_ok(inode, attr)) < 0)
+		goto out;
+
+	error = -EPERM;
+	if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
+		goto out;
+
+	if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
+		goto out;
+
+	if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
+		goto out;
+
+	if ((attr->ia_valid & ATTR_SIZE) != 0) {
+		VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
+			DENTRY_PATH(dentry),
+			(long) inode->i_size, (long) attr->ia_size);
+
+		filemap_write_and_wait(inode->i_mapping);
+
+		error = smb_open(dentry, O_WRONLY);
+		if (error)
+			goto out;
+		error = server->ops->truncate(inode, attr->ia_size);
+		if (error)
+			goto out;
+		truncate_setsize(inode, attr->ia_size);
+		refresh = 1;
+	}
+
+	if (server->opt.capabilities & SMB_CAP_UNIX) {
+		/* For now we don't want to set the size with setattr_unix */
+		attr->ia_valid &= ~ATTR_SIZE;
+		/* FIXME: only call if we actually want to set something? */
+		error = smb_proc_setattr_unix(dentry, attr, 0, 0);
+		if (!error)
+			refresh = 1;
+
+		goto out;
+	}
+
+	/*
+	 * Initialize the fattr and check for changed fields.
+	 * Note: CTIME under SMB is creation time rather than
+	 * change time, so we don't attempt to change it.
+	 */
+	smb_get_inode_attr(inode, &fattr);
+
+	changed = 0;
+	if ((attr->ia_valid & ATTR_MTIME) != 0) {
+		fattr.f_mtime = attr->ia_mtime;
+		changed = 1;
+	}
+	if ((attr->ia_valid & ATTR_ATIME) != 0) {
+		fattr.f_atime = attr->ia_atime;
+		/* Earlier protocols don't have an access time */
+		if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
+			changed = 1;
+	}
+	if (changed) {
+		error = smb_proc_settime(dentry, &fattr);
+		if (error)
+			goto out;
+		refresh = 1;
+	}
+
+	/*
+	 * Check for mode changes ... we're extremely limited in
+	 * what can be set for SMB servers: just the read-only bit.
+	 */
+	if ((attr->ia_valid & ATTR_MODE) != 0) {
+		VERBOSE("%s/%s mode change, old=%x, new=%x\n",
+			DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
+		changed = 0;
+		if (attr->ia_mode & S_IWUSR) {
+			if (fattr.attr & aRONLY) {
+				fattr.attr &= ~aRONLY;
+				changed = 1;
+			}
+		} else {
+			if (!(fattr.attr & aRONLY)) {
+				fattr.attr |= aRONLY;
+				changed = 1;
+			}
+		}
+		if (changed) {
+			error = smb_proc_setattr(dentry, &fattr);
+			if (error)
+				goto out;
+			refresh = 1;
+		}
+	}
+	error = 0;
+
+out:
+	if (refresh)
+		smb_refresh_inode(dentry);
+	unlock_kernel();
+	return error;
+}
+
+static int smb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
+}
+
+static struct file_system_type smb_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "smbfs",
+	.get_sb		= smb_get_sb,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= FS_BINARY_MOUNTDATA,
+};
+
+static int __init init_smb_fs(void)
+{
+	int err;
+	DEBUG1("registering ...\n");
+
+	err = init_inodecache();
+	if (err)
+		goto out_inode;
+	err = smb_init_request_cache();
+	if (err)
+		goto out_request;
+	err = register_filesystem(&smb_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	smb_destroy_request_cache();
+out_request:
+	destroy_inodecache();
+out_inode:
+	return err;
+}
+
+static void __exit exit_smb_fs(void)
+{
+	DEBUG1("unregistering ...\n");
+	unregister_filesystem(&smb_fs_type);
+	smb_destroy_request_cache();
+	destroy_inodecache();
+}
+
+module_init(init_smb_fs)
+module_exit(exit_smb_fs)
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/smbfs/ioctl.c b/drivers/staging/smbfs/ioctl.c
new file mode 100644
index 000000000000..2da169267470
--- /dev/null
+++ b/drivers/staging/smbfs/ioctl.c
@@ -0,0 +1,68 @@
+/*
+ *  ioctl.c
+ *
+ *  Copyright (C) 1995, 1996 by Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/highuid.h>
+#include <linux/smp_lock.h>
+#include <linux/net.h>
+
+#include <asm/uaccess.h>
+
+#include "smb_fs.h"
+#include "smb_mount.h"
+#include "proto.h"
+
+long
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
+	struct smb_conn_opt opt;
+	int result = -EINVAL;
+
+	lock_kernel();
+	switch (cmd) {
+		uid16_t uid16;
+		uid_t uid32;
+	case SMB_IOC_GETMOUNTUID:
+		SET_UID(uid16, server->mnt->mounted_uid);
+		result = put_user(uid16, (uid16_t __user *) arg);
+		break;
+	case SMB_IOC_GETMOUNTUID32:
+		SET_UID(uid32, server->mnt->mounted_uid);
+		result = put_user(uid32, (uid_t __user *) arg);
+		break;
+
+	case SMB_IOC_NEWCONN:
+		/* arg is smb_conn_opt, or NULL if no connection was made */
+		if (!arg) {
+			result = 0;
+			smb_lock_server(server);
+			server->state = CONN_RETRIED;
+			printk(KERN_ERR "Connection attempt failed!  [%d]\n",
+			       server->conn_error);
+			smbiod_flush(server);
+			smb_unlock_server(server);
+			break;
+		}
+
+		result = -EFAULT;
+		if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
+			result = smb_newconn(server, &opt);
+		break;
+	default:
+		break;
+	}
+	unlock_kernel();
+
+	return result;
+}
diff --git a/drivers/staging/smbfs/proc.c b/drivers/staging/smbfs/proc.c
new file mode 100644
index 000000000000..2fb079c37f25
--- /dev/null
+++ b/drivers/staging/smbfs/proc.c
@@ -0,0 +1,3506 @@
+/*
+ *  proc.c
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/dcache.h>
+#include <linux/nls.h>
+#include <linux/smp_lock.h>
+#include <linux/net.h>
+#include <linux/vfs.h>
+#include <net/sock.h>
+
+#include <asm/string.h>
+#include <asm/div64.h>
+
+#include "smb_fs.h"
+#include "smbno.h"
+#include "smb_mount.h"
+#include "smb_debug.h"
+#include "proto.h"
+#include "request.h"
+
+
+/* Features. Undefine if they cause problems, this should perhaps be a
+   config option. */
+#define SMBFS_POSIX_UNLINK 1
+
+/* Allow smb_retry to be interrupted. */
+#define SMB_RETRY_INTR
+
+#define SMB_VWV(packet)  ((packet) + SMB_HEADER_LEN)
+#define SMB_CMD(packet)  (*(packet+8))
+#define SMB_WCT(packet)  (*(packet+SMB_HEADER_LEN - 1))
+
+#define SMB_DIRINFO_SIZE 43
+#define SMB_STATUS_SIZE  21
+
+#define SMB_ST_BLKSIZE	(PAGE_SIZE)
+#define SMB_ST_BLKSHIFT	(PAGE_SHIFT)
+
+static struct smb_ops smb_ops_core;
+static struct smb_ops smb_ops_os2;
+static struct smb_ops smb_ops_win95;
+static struct smb_ops smb_ops_winNT;
+static struct smb_ops smb_ops_unix;
+static struct smb_ops smb_ops_null;
+
+static void
+smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
+static void
+smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
+static int
+smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
+		      struct smb_fattr *fattr);
+static int
+smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
+		    struct smb_fattr *fattr);
+static int
+smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
+		      u16 attr);
+static int
+smb_proc_setattr_ext(struct smb_sb_info *server,
+		     struct inode *inode, struct smb_fattr *fattr);
+static int
+smb_proc_query_cifsunix(struct smb_sb_info *server);
+static void
+install_ops(struct smb_ops *dst, struct smb_ops *src);
+
+
+static void
+str_upper(char *name, int len)
+{
+	while (len--)
+	{
+		if (*name >= 'a' && *name <= 'z')
+			*name -= ('a' - 'A');
+		name++;
+	}
+}
+
+#if 0
+static void
+str_lower(char *name, int len)
+{
+	while (len--)
+	{
+		if (*name >= 'A' && *name <= 'Z')
+			*name += ('a' - 'A');
+		name++;
+	}
+}
+#endif
+
+/* reverse a string inline. This is used by the dircache walking routines */
+static void reverse_string(char *buf, int len)
+{
+	char c;
+	char *end = buf+len-1;
+
+	while(buf < end) {
+		c = *buf;
+		*(buf++) = *end;
+		*(end--) = c;
+	}
+}
+
+/* no conversion, just a wrapper for memcpy. */
+static int convert_memcpy(unsigned char *output, int olen,
+			  const unsigned char *input, int ilen,
+			  struct nls_table *nls_from,
+			  struct nls_table *nls_to)
+{
+	if (olen < ilen)
+		return -ENAMETOOLONG;
+	memcpy(output, input, ilen);
+	return ilen;
+}
+
+static inline int write_char(unsigned char ch, char *output, int olen)
+{
+	if (olen < 4)
+		return -ENAMETOOLONG;
+	sprintf(output, ":x%02x", ch);
+	return 4;
+}
+
+static inline int write_unichar(wchar_t ch, char *output, int olen)
+{
+	if (olen < 5)
+		return -ENAMETOOLONG;
+	sprintf(output, ":%04x", ch);
+	return 5;
+}
+
+/* convert from one "codepage" to another (possibly being utf8). */
+static int convert_cp(unsigned char *output, int olen,
+		      const unsigned char *input, int ilen,
+		      struct nls_table *nls_from,
+		      struct nls_table *nls_to)
+{
+	int len = 0;
+	int n;
+	wchar_t ch;
+
+	while (ilen > 0) {
+		/* convert by changing to unicode and back to the new cp */
+		n = nls_from->char2uni(input, ilen, &ch);
+		if (n == -EINVAL) {
+			ilen--;
+			n = write_char(*input++, output, olen);
+			if (n < 0)
+				goto fail;
+			output += n;
+			olen -= n;
+			len += n;
+			continue;
+		} else if (n < 0)
+			goto fail;
+		input += n;
+		ilen -= n;
+
+		n = nls_to->uni2char(ch, output, olen);
+		if (n == -EINVAL)
+			n = write_unichar(ch, output, olen);
+		if (n < 0)
+			goto fail;
+		output += n;
+		olen -= n;
+
+		len += n;
+	}
+	return len;
+fail:
+	return n;
+}
+
+/* ----------------------------------------------------------- */
+
+/*
+ * nls_unicode
+ *
+ * This encodes/decodes little endian unicode format
+ */
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	if (boundlen < 2)
+		return -EINVAL;
+	*out++ = uni & 0xff;
+	*out++ = uni >> 8;
+	return 2;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	if (boundlen < 2)
+		return -EINVAL;
+	*uni = (rawstring[1] << 8) | rawstring[0];
+	return 2;
+}
+
+static struct nls_table unicode_table = {
+	.charset	= "unicode",
+	.uni2char	= uni2char,
+	.char2uni	= char2uni,
+};
+
+/* ----------------------------------------------------------- */
+
+static int setcodepage(struct nls_table **p, char *name)
+{
+	struct nls_table *nls;
+
+	if (!name || !*name) {
+		nls = NULL;
+	} else if ( (nls = load_nls(name)) == NULL) {
+		printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
+		return -EINVAL;
+	}
+
+	/* if already set, unload the previous one. */
+	if (*p && *p != &unicode_table)
+		unload_nls(*p);
+	*p = nls;
+
+	return 0;
+}
+
+/* Handles all changes to codepage settings. */
+int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
+{
+	int n = 0;
+
+	smb_lock_server(server);
+
+	/* Don't load any nls_* at all, if no remote is requested */
+	if (!*cp->remote_name)
+		goto out;
+
+	/* local */
+	n = setcodepage(&server->local_nls, cp->local_name);
+	if (n != 0)
+		goto out;
+
+	/* remote */
+	if (!strcmp(cp->remote_name, "unicode")) {
+		server->remote_nls = &unicode_table;
+	} else {
+		n = setcodepage(&server->remote_nls, cp->remote_name);
+		if (n != 0)
+			setcodepage(&server->local_nls, NULL);
+	}
+
+out:
+	if (server->local_nls != NULL && server->remote_nls != NULL)
+		server->ops->convert = convert_cp;
+	else
+		server->ops->convert = convert_memcpy;
+
+	smb_unlock_server(server);
+	return n;
+}
+
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Encoding/Decoding section                                                */
+/*                                                                           */
+/*****************************************************************************/
+
+static __u8 *
+smb_encode_smb_length(__u8 * p, __u32 len)
+{
+	*p = 0;
+	*(p+1) = 0;
+	*(p+2) = (len & 0xFF00) >> 8;
+	*(p+3) = (len & 0xFF);
+	if (len > 0xFFFF)
+	{
+		*(p+1) = 1;
+	}
+	return p + 4;
+}
+
+/*
+ * smb_build_path: build the path to entry and name storing it in buf.
+ * The path returned will have the trailing '\0'.
+ */
+static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
+			  int maxlen,
+			  struct dentry *entry, struct qstr *name)
+{
+	unsigned char *path = buf;
+	int len;
+	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
+
+	if (maxlen < (2<<unicode))
+		return -ENAMETOOLONG;
+
+	if (maxlen > SMB_MAXPATHLEN + 1)
+		maxlen = SMB_MAXPATHLEN + 1;
+
+	if (entry == NULL)
+		goto test_name_and_out;
+
+	/*
+	 * If IS_ROOT, we have to do no walking at all.
+	 */
+	if (IS_ROOT(entry) && !name) {
+		*path++ = '\\';
+		if (unicode) *path++ = '\0';
+		*path++ = '\0';
+		if (unicode) *path++ = '\0';
+		return path-buf;
+	}
+
+	/*
+	 * Build the path string walking the tree backward from end to ROOT
+	 * and store it in reversed order [see reverse_string()]
+	 */
+	dget(entry);
+	spin_lock(&entry->d_lock);
+	while (!IS_ROOT(entry)) {
+		struct dentry *parent;
+
+		if (maxlen < (3<<unicode)) {
+			spin_unlock(&entry->d_lock);
+			dput(entry);
+			return -ENAMETOOLONG;
+		}
+
+		len = server->ops->convert(path, maxlen-2, 
+				      entry->d_name.name, entry->d_name.len,
+				      server->local_nls, server->remote_nls);
+		if (len < 0) {
+			spin_unlock(&entry->d_lock);
+			dput(entry);
+			return len;
+		}
+		reverse_string(path, len);
+		path += len;
+		if (unicode) {
+			/* Note: reverse order */
+			*path++ = '\0';
+			maxlen--;
+		}
+		*path++ = '\\';
+		maxlen -= len+1;
+
+		parent = entry->d_parent;
+		dget(parent);
+		spin_unlock(&entry->d_lock);
+		dput(entry);
+		entry = parent;
+		spin_lock(&entry->d_lock);
+	}
+	spin_unlock(&entry->d_lock);
+	dput(entry);
+	reverse_string(buf, path-buf);
+
+	/* maxlen has space for at least one char */
+test_name_and_out:
+	if (name) {
+		if (maxlen < (3<<unicode))
+			return -ENAMETOOLONG;
+		*path++ = '\\';
+		if (unicode) {
+			*path++ = '\0';
+			maxlen--;
+		}
+		len = server->ops->convert(path, maxlen-2, 
+				      name->name, name->len,
+				      server->local_nls, server->remote_nls);
+		if (len < 0)
+			return len;
+		path += len;
+		maxlen -= len+1;
+	}
+	/* maxlen has space for at least one char */
+	*path++ = '\0';
+	if (unicode) *path++ = '\0';
+	return path-buf;
+}
+
+static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
+			   struct dentry *dir, struct qstr *name)
+{
+	int result;
+
+	result = smb_build_path(server, buf, maxlen, dir, name);
+	if (result < 0)
+		goto out;
+	if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
+		str_upper(buf, result);
+out:
+	return result;
+}
+
+/* encode_path for non-trans2 request SMBs */
+static int smb_simple_encode_path(struct smb_request *req, char **p,
+				  struct dentry * entry, struct qstr * name)
+{
+	struct smb_sb_info *server = req->rq_server;
+	char *s = *p;
+	int res;
+	int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
+	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
+
+	if (!maxlen)
+		return -ENAMETOOLONG;
+	*s++ = 4;	/* ASCII data format */
+
+	/*
+	 * SMB Unicode strings must be 16bit aligned relative the start of the
+	 * packet. If they are not they must be padded with 0.
+	 */
+	if (unicode) {
+		int align = s - (char *)req->rq_buffer;
+		if (!(align & 1)) {
+			*s++ = '\0';
+			maxlen--;
+		}
+	}
+
+	res = smb_encode_path(server, s, maxlen-1, entry, name);
+	if (res < 0)
+		return res;
+	*p = s + res;
+	return 0;
+}
+
+/* The following are taken directly from msdos-fs */
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+
+static int day_n[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
+		  /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
+
+
+static time_t
+utc2local(struct smb_sb_info *server, time_t time)
+{
+	return time - server->opt.serverzone*60;
+}
+
+static time_t
+local2utc(struct smb_sb_info *server, time_t time)
+{
+	return time + server->opt.serverzone*60;
+}
+
+/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
+
+static time_t
+date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
+{
+	int month, year;
+	time_t secs;
+
+	/* first subtract and mask after that... Otherwise, if
+	   date == 0, bad things happen */
+	month = ((date >> 5) - 1) & 15;
+	year = date >> 9;
+	secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
+	    ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
+						   month < 2 ? 1 : 0) + 3653);
+	/* days since 1.1.70 plus 80's leap day */
+	return local2utc(server, secs);
+}
+
+
+/* Convert linear UNIX date to a MS-DOS time/date pair. */
+
+static void
+date_unix2dos(struct smb_sb_info *server,
+	      int unix_date, __u16 *date, __u16 *time)
+{
+	int day, year, nl_day, month;
+
+	unix_date = utc2local(server, unix_date);
+	if (unix_date < 315532800)
+		unix_date = 315532800;
+
+	*time = (unix_date % 60) / 2 +
+		(((unix_date / 60) % 60) << 5) +
+		(((unix_date / 3600) % 24) << 11);
+
+	day = unix_date / 86400 - 3652;
+	year = day / 365;
+	if ((year + 3) / 4 + 365 * year > day)
+		year--;
+	day -= (year + 3) / 4 + 365 * year;
+	if (day == 59 && !(year & 3)) {
+		nl_day = day;
+		month = 2;
+	} else {
+		nl_day = (year & 3) || day <= 59 ? day : day - 1;
+		for (month = 1; month < 12; month++)
+			if (day_n[month] > nl_day)
+				break;
+	}
+	*date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
+}
+
+/* The following are taken from fs/ntfs/util.c */
+
+#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
+
+/*
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * into Unix UTC (based 1970-01-01, in seconds).
+ */
+static struct timespec
+smb_ntutc2unixutc(u64 ntutc)
+{
+	struct timespec ts;
+	/* FIXME: what about the timezone difference? */
+	/* Subtract the NTFS time offset, then convert to 1s intervals. */
+	u64 t = ntutc - NTFS_TIME_OFFSET;
+	ts.tv_nsec = do_div(t, 10000000) * 100;
+	ts.tv_sec = t; 
+	return ts;
+}
+
+/* Convert the Unix UTC into NT time */
+static u64
+smb_unixutc2ntutc(struct timespec ts)
+{
+	/* Note: timezone conversion is probably wrong. */
+	/* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
+	return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
+}
+
+#define MAX_FILE_MODE	6
+static mode_t file_mode[] = {
+	S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
+};
+
+static int smb_filetype_to_mode(u32 filetype)
+{
+	if (filetype > MAX_FILE_MODE) {
+		PARANOIA("Filetype out of range: %d\n", filetype);
+		return S_IFREG;
+	}
+	return file_mode[filetype];
+}
+
+static u32 smb_filetype_from_mode(int mode)
+{
+	if (S_ISREG(mode))
+		return UNIX_TYPE_FILE;
+	if (S_ISDIR(mode))
+		return UNIX_TYPE_DIR;
+	if (S_ISLNK(mode))
+		return UNIX_TYPE_SYMLINK;
+	if (S_ISCHR(mode))
+		return UNIX_TYPE_CHARDEV;
+	if (S_ISBLK(mode))
+		return UNIX_TYPE_BLKDEV;
+	if (S_ISFIFO(mode))
+		return UNIX_TYPE_FIFO;
+	if (S_ISSOCK(mode))
+		return UNIX_TYPE_SOCKET;
+	return UNIX_TYPE_UNKNOWN;
+}
+
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Support section.                                                         */
+/*                                                                           */
+/*****************************************************************************/
+
+__u32
+smb_len(__u8 * p)
+{
+	return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
+}
+
+static __u16
+smb_bcc(__u8 * packet)
+{
+	int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
+	return WVAL(packet, pos);
+}
+
+/* smb_valid_packet: We check if packet fulfills the basic
+   requirements of a smb packet */
+
+static int
+smb_valid_packet(__u8 * packet)
+{
+	return (packet[4] == 0xff
+		&& packet[5] == 'S'
+		&& packet[6] == 'M'
+		&& packet[7] == 'B'
+		&& (smb_len(packet) + 4 == SMB_HEADER_LEN
+		    + SMB_WCT(packet) * 2 + smb_bcc(packet)));
+}
+
+/* smb_verify: We check if we got the answer we expected, and if we
+   got enough data. If bcc == -1, we don't care. */
+
+static int
+smb_verify(__u8 * packet, int command, int wct, int bcc)
+{
+	if (SMB_CMD(packet) != command)
+		goto bad_command;
+	if (SMB_WCT(packet) < wct)
+		goto bad_wct;
+	if (bcc != -1 && smb_bcc(packet) < bcc)
+		goto bad_bcc;
+	return 0;
+
+bad_command:
+	printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
+	       command, SMB_CMD(packet));
+	goto fail;
+bad_wct:
+	printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
+	       command, wct, SMB_WCT(packet));
+	goto fail;
+bad_bcc:
+	printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
+	       command, bcc, smb_bcc(packet));
+fail:
+	return -EIO;
+}
+
+/*
+ * Returns the maximum read or write size for the "payload". Making all of the
+ * packet fit within the negotiated max_xmit size.
+ *
+ * N.B. Since this value is usually computed before locking the server,
+ * the server's packet size must never be decreased!
+ */
+static inline int
+smb_get_xmitsize(struct smb_sb_info *server, int overhead)
+{
+	return server->opt.max_xmit - overhead;
+}
+
+/*
+ * Calculate the maximum read size
+ */
+int
+smb_get_rsize(struct smb_sb_info *server)
+{
+	/* readX has 12 parameters, read has 5 */
+	int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
+	int size = smb_get_xmitsize(server, overhead);
+
+	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
+
+	return size;
+}
+
+/*
+ * Calculate the maximum write size
+ */
+int
+smb_get_wsize(struct smb_sb_info *server)
+{
+	/* writeX has 14 parameters, write has 5 */
+	int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
+	int size = smb_get_xmitsize(server, overhead);
+
+	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
+
+	return size;
+}
+
+/*
+ * Convert SMB error codes to -E... errno values.
+ */
+int
+smb_errno(struct smb_request *req)
+{
+	int errcls = req->rq_rcls;
+	int error  = req->rq_err;
+	char *class = "Unknown";
+
+	VERBOSE("errcls %d  code %d  from command 0x%x\n",
+		errcls, error, SMB_CMD(req->rq_header));
+
+	if (errcls == ERRDOS) {
+		switch (error) {
+		case ERRbadfunc:
+			return -EINVAL;
+		case ERRbadfile:
+		case ERRbadpath:
+			return -ENOENT;
+		case ERRnofids:
+			return -EMFILE;
+		case ERRnoaccess:
+			return -EACCES;
+		case ERRbadfid:
+			return -EBADF;
+		case ERRbadmcb:
+			return -EREMOTEIO;
+		case ERRnomem:
+			return -ENOMEM;
+		case ERRbadmem:
+			return -EFAULT;
+		case ERRbadenv:
+		case ERRbadformat:
+			return -EREMOTEIO;
+		case ERRbadaccess:
+			return -EACCES;
+		case ERRbaddata:
+			return -E2BIG;
+		case ERRbaddrive:
+			return -ENXIO;
+		case ERRremcd:
+			return -EREMOTEIO;
+		case ERRdiffdevice:
+			return -EXDEV;
+		case ERRnofiles:
+			return -ENOENT;
+		case ERRbadshare:
+			return -ETXTBSY;
+		case ERRlock:
+			return -EDEADLK;
+		case ERRfilexists:
+			return -EEXIST;
+		case ERROR_INVALID_PARAMETER:
+			return -EINVAL;
+		case ERROR_DISK_FULL:
+			return -ENOSPC;
+		case ERROR_INVALID_NAME:
+			return -ENOENT;
+		case ERROR_DIR_NOT_EMPTY:
+			return -ENOTEMPTY;
+		case ERROR_NOT_LOCKED:
+                       return -ENOLCK;
+		case ERROR_ALREADY_EXISTS:
+			return -EEXIST;
+		default:
+			class = "ERRDOS";
+			goto err_unknown;
+		}
+	} else if (errcls == ERRSRV) {
+		switch (error) {
+		/* N.B. This is wrong ... EIO ? */
+		case ERRerror:
+			return -ENFILE;
+		case ERRbadpw:
+			return -EINVAL;
+		case ERRbadtype:
+		case ERRtimeout:
+			return -EIO;
+		case ERRaccess:
+			return -EACCES;
+		/*
+		 * This is a fatal error, as it means the "tree ID"
+		 * for this connection is no longer valid. We map
+		 * to a special error code and get a new connection.
+		 */
+		case ERRinvnid:
+			return -EBADSLT;
+		default:
+			class = "ERRSRV";
+			goto err_unknown;
+		}
+	} else if (errcls == ERRHRD) {
+		switch (error) {
+		case ERRnowrite:
+			return -EROFS;
+		case ERRbadunit:
+			return -ENODEV;
+		case ERRnotready:
+			return -EUCLEAN;
+		case ERRbadcmd:
+		case ERRdata:
+			return -EIO;
+		case ERRbadreq:
+			return -ERANGE;
+		case ERRbadshare:
+			return -ETXTBSY;
+		case ERRlock:
+			return -EDEADLK;
+		case ERRdiskfull:
+			return -ENOSPC;
+		default:
+			class = "ERRHRD";
+			goto err_unknown;
+		}
+	} else if (errcls == ERRCMD) {
+		class = "ERRCMD";
+	} else if (errcls == SUCCESS) {
+		return 0;	/* This is the only valid 0 return */
+	}
+
+err_unknown:
+	printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
+	       class, error, SMB_CMD(req->rq_header));
+	return -EIO;
+}
+
+/* smb_request_ok: We expect the server to be locked. Then we do the
+   request and check the answer completely. When smb_request_ok
+   returns 0, you can be quite sure that everything went well. When
+   the answer is <=0, the returned number is a valid unix errno. */
+
+static int
+smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
+{
+	int result;
+
+	req->rq_resp_wct = wct;
+	req->rq_resp_bcc = bcc;
+
+	result = smb_add_request(req);
+	if (result != 0) {
+		DEBUG1("smb_request failed\n");
+		goto out;
+	}
+
+	if (smb_valid_packet(req->rq_header) != 0) {
+		PARANOIA("invalid packet!\n");
+		goto out;
+	}
+
+	result = smb_verify(req->rq_header, command, wct, bcc);
+
+out:
+	return result;
+}
+
+/*
+ * This implements the NEWCONN ioctl. It installs the server pid,
+ * sets server->state to CONN_VALID, and wakes up the waiting process.
+ */
+int
+smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
+{
+	struct file *filp;
+	struct sock *sk;
+	int error;
+
+	VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
+
+	smb_lock_server(server);
+
+	/*
+	 * Make sure we don't already have a valid connection ...
+	 */
+	error = -EINVAL;
+	if (server->state == CONN_VALID)
+		goto out;
+
+	error = -EACCES;
+	if (current_uid() != server->mnt->mounted_uid &&
+	    !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	error = -EBADF;
+	filp = fget(opt->fd);
+	if (!filp)
+		goto out;
+	if (!smb_valid_socket(filp->f_path.dentry->d_inode))
+		goto out_putf;
+
+	server->sock_file = filp;
+	server->conn_pid = get_pid(task_pid(current));
+	server->opt = *opt;
+	server->generation += 1;
+	server->state = CONN_VALID;
+	error = 0;
+
+	if (server->conn_error) {
+		/*
+		 * conn_error is the returncode we originally decided to
+		 * drop the old connection on. This message should be positive
+		 * and not make people ask questions on why smbfs is printing
+		 * error messages ...
+		 */
+		printk(KERN_INFO "SMB connection re-established (%d)\n",
+		       server->conn_error);
+		server->conn_error = 0;
+	}
+
+	/*
+	 * Store the server in sock user_data (Only used by sunrpc)
+	 */
+	sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
+	sk->sk_user_data = server;
+
+	/* chain into the data_ready callback */
+	server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
+
+	/* check if we have an old smbmount that uses seconds for the 
+	   serverzone */
+	if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
+		server->opt.serverzone /= 60;
+
+	/* now that we have an established connection we can detect the server
+	   type and enable bug workarounds */
+	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
+		install_ops(server->ops, &smb_ops_core);
+	else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
+		install_ops(server->ops, &smb_ops_os2);
+	else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
+		 (server->opt.max_xmit < 0x1000) &&
+		 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
+		/* FIXME: can we kill the WIN95 flag now? */
+		server->mnt->flags |= SMB_MOUNT_WIN95;
+		VERBOSE("detected WIN95 server\n");
+		install_ops(server->ops, &smb_ops_win95);
+	} else {
+		/*
+		 * Samba has max_xmit 65535
+		 * NT4spX has max_xmit 4536 (or something like that)
+		 * win2k has ...
+		 */
+		VERBOSE("detected NT1 (Samba, NT4/5) server\n");
+		install_ops(server->ops, &smb_ops_winNT);
+	}
+
+	/* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
+	if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
+		server->ops->getattr = smb_proc_getattr_core;
+	} else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
+		server->ops->getattr = smb_proc_getattr_ff;
+	}
+
+	/* Decode server capabilities */
+	if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
+		/* Should be ok to set this now, as no one can access the
+		   mount until the connection has been established. */
+		SB_of(server)->s_maxbytes = ~0ULL >> 1;
+		VERBOSE("LFS enabled\n");
+	}
+	if (server->opt.capabilities & SMB_CAP_UNICODE) {
+		server->mnt->flags |= SMB_MOUNT_UNICODE;
+		VERBOSE("Unicode enabled\n");
+	} else {
+		server->mnt->flags &= ~SMB_MOUNT_UNICODE;
+	}
+#if 0
+	/* flags we may test for other patches ... */
+	if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
+		VERBOSE("Large reads enabled\n");
+	}
+	if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
+		VERBOSE("Large writes enabled\n");
+	}
+#endif
+	if (server->opt.capabilities & SMB_CAP_UNIX) {
+		struct inode *inode;
+		VERBOSE("Using UNIX CIFS extensions\n");
+		install_ops(server->ops, &smb_ops_unix);
+		inode = SB_of(server)->s_root->d_inode;
+		if (inode)
+			inode->i_op = &smb_dir_inode_operations_unix;
+	}
+
+	VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
+		server->opt.protocol, server->opt.max_xmit,
+		pid_nr(server->conn_pid), server->opt.capabilities);
+
+	/* FIXME: this really should be done by smbmount. */
+	if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
+		server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
+	}
+
+	smb_unlock_server(server);
+	smbiod_wake_up();
+	if (server->opt.capabilities & SMB_CAP_UNIX)
+		smb_proc_query_cifsunix(server);
+
+	server->conn_complete++;
+	wake_up_interruptible_all(&server->conn_wq);
+	return error;
+
+out:
+	smb_unlock_server(server);
+	smbiod_wake_up();
+	return error;
+
+out_putf:
+	fput(filp);
+	goto out;
+}
+
+/* smb_setup_header: We completely set up the packet. You only have to
+   insert the command-specific fields */
+
+__u8 *
+smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
+{
+	__u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
+	__u8 *p = req->rq_header;
+	struct smb_sb_info *server = req->rq_server;
+
+	p = smb_encode_smb_length(p, xmit_len - 4);
+
+	*p++ = 0xff;
+	*p++ = 'S';
+	*p++ = 'M';
+	*p++ = 'B';
+	*p++ = command;
+
+	memset(p, '\0', 19);
+	p += 19;
+	p += 8;
+
+	if (server->opt.protocol > SMB_PROTOCOL_CORE) {
+		int flags = SMB_FLAGS_CASELESS_PATHNAMES;
+		int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
+			SMB_FLAGS2_EXTENDED_ATTRIBUTES;	/* EA? not really ... */
+
+		*(req->rq_header + smb_flg) = flags;
+		if (server->mnt->flags & SMB_MOUNT_UNICODE)
+			flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
+		WSET(req->rq_header, smb_flg2, flags2);
+	}
+	*p++ = wct;		/* wct */
+	p += 2 * wct;
+	WSET(p, 0, bcc);
+
+	/* Include the header in the data to send */
+	req->rq_iovlen = 1;
+	req->rq_iov[0].iov_base = req->rq_header;
+	req->rq_iov[0].iov_len  = xmit_len - bcc;
+
+	return req->rq_buffer;
+}
+
+static void
+smb_setup_bcc(struct smb_request *req, __u8 *p)
+{
+	u16 bcc = p - req->rq_buffer;
+	u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
+
+	WSET(pbcc, 0, bcc);
+
+	smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + 
+			      2*SMB_WCT(req->rq_header) - 2 + bcc);
+
+	/* Include the "bytes" in the data to send */
+	req->rq_iovlen = 2;
+	req->rq_iov[1].iov_base = req->rq_buffer;
+	req->rq_iov[1].iov_len  = bcc;
+}
+
+static int
+smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
+	      __u16 mode, off_t offset)
+{
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBlseek, 4, 0);
+	WSET(req->rq_header, smb_vwv0, fileid);
+	WSET(req->rq_header, smb_vwv1, mode);
+	DSET(req->rq_header, smb_vwv2, offset);
+	req->rq_flags |= SMB_REQ_NORETRY;
+
+	result = smb_request_ok(req, SMBlseek, 2, 0);
+	if (result < 0) {
+		result = 0;
+		goto out_free;
+	}
+
+	result = DVAL(req->rq_header, smb_vwv0);
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
+{
+	struct inode *ino = dentry->d_inode;
+	struct smb_inode_info *ei = SMB_I(ino);
+	int mode, read_write = 0x42, read_only = 0x40;
+	int res;
+	char *p;
+	struct smb_request *req;
+
+	/*
+	 * Attempt to open r/w, unless there are no write privileges.
+	 */
+	mode = read_write;
+	if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
+		mode = read_only;
+#if 0
+	/* FIXME: why is this code not in? below we fix it so that a caller
+	   wanting RO doesn't get RW. smb_revalidate_inode does some 
+	   optimization based on access mode. tail -f needs it to be correct.
+
+	   We must open rw since we don't do the open if called a second time
+	   with different 'wish'. Is that not supported by smb servers? */
+	if (!(wish & (O_WRONLY | O_RDWR)))
+		mode = read_only;
+#endif
+
+	res = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+      retry:
+	p = smb_setup_header(req, SMBopen, 2, 0);
+	WSET(req->rq_header, smb_vwv0, mode);
+	WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
+	res = smb_simple_encode_path(req, &p, dentry, NULL);
+	if (res < 0)
+		goto out_free;
+	smb_setup_bcc(req, p);
+
+	res = smb_request_ok(req, SMBopen, 7, 0);
+	if (res != 0) {
+		if (mode == read_write &&
+		    (res == -EACCES || res == -ETXTBSY || res == -EROFS))
+		{
+			VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
+				DENTRY_PATH(dentry), res);
+			mode = read_only;
+			req->rq_flags = 0;
+			goto retry;
+		}
+		goto out_free;
+	}
+	/* We should now have data in vwv[0..6]. */
+
+	ei->fileid = WVAL(req->rq_header, smb_vwv0);
+	ei->attr   = WVAL(req->rq_header, smb_vwv1);
+	/* smb_vwv2 has mtime */
+	/* smb_vwv4 has size  */
+	ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
+	ei->open = server->generation;
+
+out_free:
+	smb_rput(req);
+out:
+	return res;
+}
+
+/*
+ * Make sure the file is open, and check that the access
+ * is compatible with the desired access.
+ */
+int
+smb_open(struct dentry *dentry, int wish)
+{
+	struct inode *inode = dentry->d_inode;
+	int result;
+	__u16 access;
+
+	result = -ENOENT;
+	if (!inode) {
+		printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
+		       DENTRY_PATH(dentry));
+		goto out;
+	}
+
+	if (!smb_is_open(inode)) {
+		struct smb_sb_info *server = server_from_inode(inode);
+		result = 0;
+		if (!smb_is_open(inode))
+			result = smb_proc_open(server, dentry, wish);
+		if (result)
+			goto out;
+		/*
+		 * A successful open means the path is still valid ...
+		 */
+		smb_renew_times(dentry);
+	}
+
+	/*
+	 * Check whether the access is compatible with the desired mode.
+	 */
+	result = 0;
+	access = SMB_I(inode)->access;
+	if (access != wish && access != SMB_O_RDWR) {
+		PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
+			 DENTRY_PATH(dentry), access, wish);
+		result = -EACCES;
+	}
+out:
+	return result;
+}
+
+static int 
+smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
+{
+	struct smb_request *req;
+	int result = -ENOMEM;
+
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBclose, 3, 0);
+	WSET(req->rq_header, smb_vwv0, fileid);
+	DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
+	req->rq_flags |= SMB_REQ_NORETRY;
+	result = smb_request_ok(req, SMBclose, 0, 0);
+
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Win NT 4.0 has an apparent bug in that it fails to update the
+ * modify time when writing to a file. As a workaround, we update
+ * both modify and access time locally, and post the times to the
+ * server when closing the file.
+ */
+static int 
+smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
+{
+	struct smb_inode_info *ei = SMB_I(ino);
+	int result = 0;
+	if (smb_is_open(ino))
+	{
+		/*
+		 * We clear the open flag in advance, in case another
+ 		 * process observes the value while we block below.
+		 */
+		ei->open = 0;
+
+		/*
+		 * Kludge alert: SMB timestamps are accurate only to
+		 * two seconds ... round the times to avoid needless
+		 * cache invalidations!
+		 */
+		if (ino->i_mtime.tv_sec & 1) { 
+			ino->i_mtime.tv_sec--;
+			ino->i_mtime.tv_nsec = 0; 
+		}
+		if (ino->i_atime.tv_sec & 1) {
+			ino->i_atime.tv_sec--;
+			ino->i_atime.tv_nsec = 0;
+		}
+		/*
+		 * If the file is open with write permissions,
+		 * update the time stamps to sync mtime and atime.
+		 */
+		if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
+		    (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
+		    !(ei->access == SMB_O_RDONLY))
+		{
+			struct smb_fattr fattr;
+			smb_get_inode_attr(ino, &fattr);
+			smb_proc_setattr_ext(server, ino, &fattr);
+		}
+
+		result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
+		/*
+		 * Force a revalidation after closing ... some servers
+		 * don't post the size until the file has been closed.
+		 */
+		if (server->opt.protocol < SMB_PROTOCOL_NT1)
+			ei->oldmtime = 0;
+		ei->closed = jiffies;
+	}
+	return result;
+}
+
+int
+smb_close(struct inode *ino)
+{
+	int result = 0;
+
+	if (smb_is_open(ino)) {
+		struct smb_sb_info *server = server_from_inode(ino);
+		result = smb_proc_close_inode(server, ino);
+	}
+	return result;
+}
+
+/*
+ * This is used to close a file following a failed instantiate.
+ * Since we don't have an inode, we can't use any of the above.
+ */
+int
+smb_close_fileid(struct dentry *dentry, __u16 fileid)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	int result;
+
+	result = smb_proc_close(server, fileid, get_seconds());
+	return result;
+}
+
+/* In smb_proc_read and smb_proc_write we do not retry, because the
+   file-id would not be valid after a reconnection. */
+
+static void
+smb_proc_read_data(struct smb_request *req)
+{
+	req->rq_iov[0].iov_base = req->rq_buffer;
+	req->rq_iov[0].iov_len  = 3;
+
+	req->rq_iov[1].iov_base = req->rq_page;
+	req->rq_iov[1].iov_len  = req->rq_rsize;
+	req->rq_iovlen = 2;
+
+	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
+}
+
+static int
+smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	__u16 returned_count, data_len;
+	unsigned char *buf;
+	int result;
+	struct smb_request *req;
+	u8 rbuf[4];
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBread, 5, 0);
+	buf = req->rq_header;
+	WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
+	WSET(buf, smb_vwv1, count);
+	DSET(buf, smb_vwv2, offset);
+	WSET(buf, smb_vwv4, 0);
+
+	req->rq_page = data;
+	req->rq_rsize = count;
+	req->rq_callback = smb_proc_read_data;
+	req->rq_buffer = rbuf;
+	req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
+
+	result = smb_request_ok(req, SMBread, 5, -1);
+	if (result < 0)
+		goto out_free;
+	returned_count = WVAL(req->rq_header, smb_vwv0);
+
+	data_len = WVAL(rbuf, 1);
+
+	if (returned_count != data_len) {
+		printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
+		printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
+		       returned_count, data_len);
+	}
+	result = data_len;
+
+out_free:
+	smb_rput(req);
+out:
+	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
+		inode->i_ino, SMB_I(inode)->fileid, count, result);
+	return result;
+}
+
+static int
+smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	int result;
+	u16 fileid = SMB_I(inode)->fileid;
+	u8 buf[4];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
+		inode->i_ino, fileid, count, offset);
+
+	smb_setup_header(req, SMBwrite, 5, count + 3);
+	WSET(req->rq_header, smb_vwv0, fileid);
+	WSET(req->rq_header, smb_vwv1, count);
+	DSET(req->rq_header, smb_vwv2, offset);
+	WSET(req->rq_header, smb_vwv4, 0);
+
+	buf[0] = 1;
+	WSET(buf, 1, count);	/* yes, again ... */
+	req->rq_iov[1].iov_base = buf;
+	req->rq_iov[1].iov_len = 3;
+	req->rq_iov[2].iov_base = (char *) data;
+	req->rq_iov[2].iov_len = count;
+	req->rq_iovlen = 3;
+	req->rq_flags |= SMB_REQ_NORETRY;
+
+	result = smb_request_ok(req, SMBwrite, 1, 0);
+	if (result >= 0)
+		result = WVAL(req->rq_header, smb_vwv0);
+
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * In smb_proc_readX and smb_proc_writeX we do not retry, because the
+ * file-id would not be valid after a reconnection.
+ */
+
+#define SMB_READX_MAX_PAD      64
+static void
+smb_proc_readX_data(struct smb_request *req)
+{
+	/* header length, excluding the netbios length (-4) */
+	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
+	int data_off = WVAL(req->rq_header, smb_vwv6);
+
+	/*
+	 * Some genius made the padding to the data bytes arbitrary.
+	 * So we must first calculate the amount of padding used by the server.
+	 */
+	data_off -= hdrlen;
+	if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
+		PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
+		PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
+		req->rq_rlen = req->rq_bufsize + 1;
+		return;
+	}
+	req->rq_iov[0].iov_base = req->rq_buffer;
+	req->rq_iov[0].iov_len  = data_off;
+
+	req->rq_iov[1].iov_base = req->rq_page;
+	req->rq_iov[1].iov_len  = req->rq_rsize;
+	req->rq_iovlen = 2;
+
+	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
+}
+
+static int
+smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	unsigned char *buf;
+	int result;
+	struct smb_request *req;
+	static char pad[SMB_READX_MAX_PAD];
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBreadX, 12, 0);
+	buf = req->rq_header;
+	WSET(buf, smb_vwv0, 0x00ff);
+	WSET(buf, smb_vwv1, 0);
+	WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
+	DSET(buf, smb_vwv3, (u32)offset);               /* low 32 bits */
+	WSET(buf, smb_vwv5, count);
+	WSET(buf, smb_vwv6, 0);
+	DSET(buf, smb_vwv7, 0);
+	WSET(buf, smb_vwv9, 0);
+	DSET(buf, smb_vwv10, (u32)(offset >> 32));      /* high 32 bits */
+	WSET(buf, smb_vwv11, 0);
+
+	req->rq_page = data;
+	req->rq_rsize = count;
+	req->rq_callback = smb_proc_readX_data;
+	req->rq_buffer = pad;
+	req->rq_bufsize = SMB_READX_MAX_PAD;
+	req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
+
+	result = smb_request_ok(req, SMBreadX, 12, -1);
+	if (result < 0)
+		goto out_free;
+	result = WVAL(req->rq_header, smb_vwv5);
+
+out_free:
+	smb_rput(req);
+out:
+	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
+		inode->i_ino, SMB_I(inode)->fileid, count, result);
+	return result;
+}
+
+static int
+smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	int result;
+	u8 *p;
+	static u8 pad[4];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
+		inode->i_ino, SMB_I(inode)->fileid, count, offset);
+
+	p = smb_setup_header(req, SMBwriteX, 14, count + 1);
+	WSET(req->rq_header, smb_vwv0, 0x00ff);
+	WSET(req->rq_header, smb_vwv1, 0);
+	WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
+	DSET(req->rq_header, smb_vwv3, (u32)offset);	/* low 32 bits */
+	DSET(req->rq_header, smb_vwv5, 0);
+	WSET(req->rq_header, smb_vwv7, 0);		/* write mode */
+	WSET(req->rq_header, smb_vwv8, 0);
+	WSET(req->rq_header, smb_vwv9, 0);
+	WSET(req->rq_header, smb_vwv10, count);		/* data length */
+	WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
+	DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
+
+	req->rq_iov[1].iov_base = pad;
+	req->rq_iov[1].iov_len = 1;
+	req->rq_iov[2].iov_base = (char *) data;
+	req->rq_iov[2].iov_len = count;
+	req->rq_iovlen = 3;
+	req->rq_flags |= SMB_REQ_NORETRY;
+
+	result = smb_request_ok(req, SMBwriteX, 6, 0);
+ 	if (result >= 0)
+		result = WVAL(req->rq_header, smb_vwv2);
+
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	char *p;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	p = smb_setup_header(req, SMBcreate, 3, 0);
+	WSET(req->rq_header, smb_vwv0, attr);
+	DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
+	result = smb_simple_encode_path(req, &p, dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	smb_setup_bcc(req, p);
+
+	result = smb_request_ok(req, SMBcreate, 1, 0);
+	if (result < 0)
+		goto out_free;
+
+	*fileid = WVAL(req->rq_header, smb_vwv0);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	struct smb_sb_info *server = server_from_dentry(old_dentry);
+	char *p;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	p = smb_setup_header(req, SMBmv, 1, 0);
+	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
+	result = smb_simple_encode_path(req, &p, old_dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	result = smb_simple_encode_path(req, &p, new_dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	smb_setup_bcc(req, p);
+
+	if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
+		goto out_free;
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Code common to mkdir and rmdir.
+ */
+static int
+smb_proc_generic_command(struct dentry *dentry, __u8 command)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	char *p;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	p = smb_setup_header(req, command, 0, 0);
+	result = smb_simple_encode_path(req, &p, dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	smb_setup_bcc(req, p);
+
+	result = smb_request_ok(req, command, 0, 0);
+	if (result < 0)
+		goto out_free;
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_mkdir(struct dentry *dentry)
+{
+	return smb_proc_generic_command(dentry, SMBmkdir);
+}
+
+int
+smb_proc_rmdir(struct dentry *dentry)
+{
+	return smb_proc_generic_command(dentry, SMBrmdir);
+}
+
+#if SMBFS_POSIX_UNLINK
+/*
+ * Removes readonly attribute from a file. Used by unlink to give posix
+ * semantics.
+ */
+static int
+smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
+{
+	int result;
+	struct smb_fattr fattr;
+
+	/* FIXME: cifsUE should allow removing a readonly file. */
+
+	/* first get current attribute */
+	smb_init_dirent(server, &fattr);
+	result = server->ops->getattr(server, dentry, &fattr);
+	smb_finish_dirent(server, &fattr);
+	if (result < 0)
+		return result;
+
+	/* if RONLY attribute is set, remove it */
+	if (fattr.attr & aRONLY) {  /* read only attribute is set */
+		fattr.attr &= ~aRONLY;
+		result = smb_proc_setattr_core(server, dentry, fattr.attr);
+	}
+	return result;
+}
+#endif
+
+int
+smb_proc_unlink(struct dentry *dentry)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	int flag = 0;
+	char *p;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+      retry:
+	p = smb_setup_header(req, SMBunlink, 1, 0);
+	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
+	result = smb_simple_encode_path(req, &p, dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	smb_setup_bcc(req, p);
+
+	if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
+#if SMBFS_POSIX_UNLINK
+		if (result == -EACCES && !flag) {
+			/* Posix semantics is for the read-only state
+			   of a file to be ignored in unlink(). In the
+			   SMB world a unlink() is refused on a
+			   read-only file. To make things easier for
+			   unix users we try to override the files
+			   permission if the unlink fails with the
+			   right error.
+			   This introduces a race condition that could
+			   lead to a file being written by someone who
+			   shouldn't have access, but as far as I can
+			   tell that is unavoidable */
+
+			/* remove RONLY attribute and try again */
+			result = smb_set_rw(dentry,server);
+			if (result == 0) {
+				flag = 1;
+				req->rq_flags = 0;
+				goto retry;
+			}
+		}
+#endif
+		goto out_free;
+	}
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
+{
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBflush, 1, 0);
+	WSET(req->rq_header, smb_vwv0, fileid);
+	req->rq_flags |= SMB_REQ_NORETRY;
+	result = smb_request_ok(req, SMBflush, 0, 0);
+
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_trunc32(struct inode *inode, loff_t length)
+{
+	/*
+	 * Writing 0bytes is old-SMB magic for truncating files.
+	 * MAX_NON_LFS should prevent this from being called with a too
+	 * large offset.
+	 */
+	return smb_proc_write(inode, length, 0, NULL);
+}
+
+static int
+smb_proc_trunc64(struct inode *inode, loff_t length)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	int result;
+	char *param;
+	char *data;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 14)))
+		goto out;
+
+	param = req->rq_buffer;
+	data = req->rq_buffer + 6;
+
+	/* FIXME: must we also set allocation size? winNT seems to do that */
+	WSET(param, 0, SMB_I(inode)->fileid);
+	WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
+	WSET(param, 4, 0);
+	LSET(data, 0, length);
+
+	req->rq_trans2_command = TRANSACT2_SETFILEINFO;
+	req->rq_ldata = 8;
+	req->rq_data  = data;
+	req->rq_lparm = 6;
+	req->rq_parm  = param;
+	req->rq_flags |= SMB_REQ_NORETRY;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	result = 0;
+	if (req->rq_rcls != 0)
+		result = smb_errno(req);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_trunc95(struct inode *inode, loff_t length)
+{
+	struct smb_sb_info *server = server_from_inode(inode);
+	int result = smb_proc_trunc32(inode, length);
+ 
+	/*
+	 * win9x doesn't appear to update the size immediately.
+	 * It will return the old file size after the truncate,
+	 * confusing smbfs. So we force an update.
+	 *
+	 * FIXME: is this still necessary?
+	 */
+	smb_proc_flush(server, SMB_I(inode)->fileid);
+	return result;
+}
+
+static void
+smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
+{
+	memset(fattr, 0, sizeof(*fattr));
+
+	fattr->f_nlink = 1;
+	fattr->f_uid = server->mnt->uid;
+	fattr->f_gid = server->mnt->gid;
+	fattr->f_unix = 0;
+}
+
+static void
+smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
+{
+	if (fattr->f_unix)
+		return;
+
+	fattr->f_mode = server->mnt->file_mode;
+	if (fattr->attr & aDIR) {
+		fattr->f_mode = server->mnt->dir_mode;
+		fattr->f_size = SMB_ST_BLKSIZE;
+	}
+	/* Check the read-only flag */
+	if (fattr->attr & aRONLY)
+		fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
+
+	/* How many 512 byte blocks do we need for this file? */
+	fattr->f_blocks = 0;
+	if (fattr->f_size != 0)
+		fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
+	return;
+}
+
+void
+smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
+		     struct super_block *sb)
+{
+	smb_init_dirent(server, fattr);
+	fattr->attr = aDIR;
+	fattr->f_ino = 2; /* traditional root inode number */
+	fattr->f_mtime = current_fs_time(sb);
+	smb_finish_dirent(server, fattr);
+}
+
+/*
+ * Decode a dirent for old protocols
+ *
+ * qname is filled with the decoded, and possibly translated, name.
+ * fattr receives decoded attributes
+ *
+ * Bugs Noted:
+ * (1) Pathworks servers may pad the name with extra spaces.
+ */
+static char *
+smb_decode_short_dirent(struct smb_sb_info *server, char *p,
+			struct qstr *qname, struct smb_fattr *fattr,
+			unsigned char *name_buf)
+{
+	int len;
+
+	/*
+	 * SMB doesn't have a concept of inode numbers ...
+	 */
+	smb_init_dirent(server, fattr);
+	fattr->f_ino = 0;	/* FIXME: do we need this? */
+
+	p += SMB_STATUS_SIZE;	/* reserved (search_status) */
+	fattr->attr = *p;
+	fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
+	fattr->f_mtime.tv_nsec = 0;
+	fattr->f_size = DVAL(p, 5);
+	fattr->f_ctime = fattr->f_mtime;
+	fattr->f_atime = fattr->f_mtime;
+	qname->name = p + 9;
+	len = strnlen(qname->name, 12);
+
+	/*
+	 * Trim trailing blanks for Pathworks servers
+	 */
+	while (len > 2 && qname->name[len-1] == ' ')
+		len--;
+
+	smb_finish_dirent(server, fattr);
+
+#if 0
+	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
+	   allow the flag to be set anyway. It kills const. Remove? */
+	switch (server->opt.case_handling) {
+	case SMB_CASE_UPPER:
+		str_upper(entry->name, len);
+		break;
+	case SMB_CASE_LOWER:
+		str_lower(entry->name, len);
+		break;
+	default:
+		break;
+	}
+#endif
+
+	qname->len = 0;
+	len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
+				   qname->name, len,
+				   server->remote_nls, server->local_nls);
+	if (len > 0) {
+		qname->len = len;
+		qname->name = name_buf;
+		DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
+	}
+
+	return p + 22;
+}
+
+/*
+ * This routine is used to read in directory entries from the network.
+ * Note that it is for short directory name seeks, i.e.: protocol <
+ * SMB_PROTOCOL_LANMAN2
+ */
+static int
+smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
+		       struct smb_cache_control *ctl)
+{
+	struct dentry *dir = filp->f_path.dentry;
+	struct smb_sb_info *server = server_from_dentry(dir);
+	struct qstr qname;
+	struct smb_fattr fattr;
+	char *p;
+	int result;
+	int i, first, entries_seen, entries;
+	int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
+	__u16 bcc;
+	__u16 count;
+	char status[SMB_STATUS_SIZE];
+	static struct qstr mask = {
+		.name	= "*.*",
+		.len	= 3,
+	};
+	unsigned char *last_status;
+	struct smb_request *req;
+	unsigned char *name_buf;
+
+	VERBOSE("%s/%s\n", DENTRY_PATH(dir));
+
+	lock_kernel();
+
+	result = -ENOMEM;
+	if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
+		goto out;
+
+	first = 1;
+	entries = 0;
+	entries_seen = 2; /* implicit . and .. */
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
+		goto out_name;
+
+	while (1) {
+		p = smb_setup_header(req, SMBsearch, 2, 0);
+		WSET(req->rq_header, smb_vwv0, entries_asked);
+		WSET(req->rq_header, smb_vwv1, aDIR);
+		if (first == 1) {
+			result = smb_simple_encode_path(req, &p, dir, &mask);
+			if (result < 0)
+				goto out_free;
+			if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
+				result = -ENAMETOOLONG;
+				goto out_free;
+			}
+			*p++ = 5;
+			WSET(p, 0, 0);
+			p += 2;
+			first = 0;
+		} else {
+			if (p + 5 + SMB_STATUS_SIZE >
+			    (char *)req->rq_buffer + req->rq_bufsize) {
+				result = -ENAMETOOLONG;
+				goto out_free;
+			}
+				
+			*p++ = 4;
+			*p++ = 0;
+			*p++ = 5;
+			WSET(p, 0, SMB_STATUS_SIZE);
+			p += 2;
+			memcpy(p, status, SMB_STATUS_SIZE);
+			p += SMB_STATUS_SIZE;
+		}
+
+		smb_setup_bcc(req, p);
+
+		result = smb_request_ok(req, SMBsearch, 1, -1);
+		if (result < 0) {
+			if ((req->rq_rcls == ERRDOS) && 
+			    (req->rq_err  == ERRnofiles))
+				break;
+			goto out_free;
+		}
+		count = WVAL(req->rq_header, smb_vwv0);
+		if (count <= 0)
+			break;
+
+		result = -EIO;
+		bcc = smb_bcc(req->rq_header);
+		if (bcc != count * SMB_DIRINFO_SIZE + 3)
+			goto out_free;
+		p = req->rq_buffer + 3;
+
+
+		/* Make sure the response fits in the buffer. Fixed sized 
+		   entries means we don't have to check in the decode loop. */
+
+		last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
+
+		if (last_status + SMB_DIRINFO_SIZE >=
+		    req->rq_buffer + req->rq_bufsize) {
+			printk(KERN_ERR "smb_proc_readdir_short: "
+			       "last dir entry outside buffer! "
+			       "%d@%p  %d@%p\n", SMB_DIRINFO_SIZE, last_status,
+			       req->rq_bufsize, req->rq_buffer);
+			goto out_free;
+		}
+
+		/* Read the last entry into the status field. */
+		memcpy(status, last_status, SMB_STATUS_SIZE);
+
+
+		/* Now we are ready to parse smb directory entries. */
+
+		for (i = 0; i < count; i++) {
+			p = smb_decode_short_dirent(server, p, 
+						    &qname, &fattr, name_buf);
+			if (qname.len == 0)
+				continue;
+
+			if (entries_seen == 2 && qname.name[0] == '.') {
+				if (qname.len == 1)
+					continue;
+				if (qname.name[1] == '.' && qname.len == 2)
+					continue;
+			}
+			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
+					    &qname, &fattr))
+				;	/* stop reading? */
+			entries_seen++;
+		}
+	}
+	result = entries;
+
+out_free:
+	smb_rput(req);
+out_name:
+	kfree(name_buf);
+out:
+	unlock_kernel();
+	return result;
+}
+
+static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
+{
+	u64 size, disk_bytes;
+
+	/* FIXME: verify nls support. all is sent as utf8? */
+
+	fattr->f_unix = 1;
+	fattr->f_mode = 0;
+
+	/* FIXME: use the uniqueID from the remote instead? */
+	/* 0 L file size in bytes */
+	/* 8 L file size on disk in bytes (block count) */
+	/* 40 L uid */
+	/* 48 L gid */
+	/* 56 W file type */
+	/* 60 L devmajor */
+	/* 68 L devminor */
+	/* 76 L unique ID (inode) */
+	/* 84 L permissions */
+	/* 92 L link count */
+
+	size = LVAL(p, 0);
+	disk_bytes = LVAL(p, 8);
+
+	/*
+	 * Some samba versions round up on-disk byte usage
+	 * to 1MB boundaries, making it useless. When seeing
+	 * that, use the size instead.
+	 */
+	if (!(disk_bytes & 0xfffff))
+		disk_bytes = size+511;
+
+	fattr->f_size = size;
+	fattr->f_blocks = disk_bytes >> 9;
+	fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
+	fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
+	fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
+
+	if (server->mnt->flags & SMB_MOUNT_UID)
+		fattr->f_uid = server->mnt->uid;
+	else
+		fattr->f_uid = LVAL(p, 40);
+
+	if (server->mnt->flags & SMB_MOUNT_GID)
+		fattr->f_gid = server->mnt->gid;
+	else
+		fattr->f_gid = LVAL(p, 48);
+
+	fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
+
+	if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
+		__u64 major = LVAL(p, 60);
+		__u64 minor = LVAL(p, 68);
+
+		fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
+		if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
+	    	MINOR(fattr->f_rdev) != (minor & 0xffffffff))
+			fattr->f_rdev = 0;
+	}
+
+	fattr->f_mode |= LVAL(p, 84);
+
+	if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
+	     (S_ISDIR(fattr->f_mode)) )
+		fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
+	else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
+	          !(S_ISDIR(fattr->f_mode)) )
+		fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
+				(fattr->f_mode & S_IFMT);
+
+}
+
+/*
+ * Interpret a long filename structure using the specified info level:
+ *   level 1 for anything below NT1 protocol
+ *   level 260 for NT1 protocol
+ *
+ * qname is filled with the decoded, and possibly translated, name
+ * fattr receives decoded attributes.
+ *
+ * Bugs Noted:
+ * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
+ */
+static char *
+smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
+		       struct qstr *qname, struct smb_fattr *fattr,
+		       unsigned char *name_buf)
+{
+	char *result;
+	unsigned int len = 0;
+	int n;
+	__u16 date, time;
+	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
+
+	/*
+	 * SMB doesn't have a concept of inode numbers ...
+	 */
+	smb_init_dirent(server, fattr);
+	fattr->f_ino = 0;	/* FIXME: do we need this? */
+
+	switch (level) {
+	case 1:
+		len = *((unsigned char *) p + 22);
+		qname->name = p + 23;
+		result = p + 24 + len;
+
+		date = WVAL(p, 0);
+		time = WVAL(p, 2);
+		fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
+		fattr->f_ctime.tv_nsec = 0;
+
+		date = WVAL(p, 4);
+		time = WVAL(p, 6);
+		fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
+		fattr->f_atime.tv_nsec = 0;
+
+		date = WVAL(p, 8);
+		time = WVAL(p, 10);
+		fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
+		fattr->f_mtime.tv_nsec = 0;
+		fattr->f_size = DVAL(p, 12);
+		/* ULONG allocation size */
+		fattr->attr = WVAL(p, 20);
+
+		VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
+			p, len, len, qname->name);
+		break;
+	case 260:
+		result = p + WVAL(p, 0);
+		len = DVAL(p, 60);
+		if (len > 255) len = 255;
+		/* NT4 null terminates, unless we are using unicode ... */
+		qname->name = p + 94;
+		if (!unicode && len && qname->name[len-1] == '\0')
+			len--;
+
+		fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
+		fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
+		fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
+		/* change time (32) */
+		fattr->f_size = LVAL(p, 40);
+		/* alloc size (48) */
+		fattr->attr = DVAL(p, 56);
+
+		VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
+			p, len, len, qname->name);
+		break;
+	case SMB_FIND_FILE_UNIX:
+		result = p + WVAL(p, 0);
+		qname->name = p + 108;
+
+		len = strlen(qname->name);
+		/* FIXME: should we check the length?? */
+
+		p += 8;
+		smb_decode_unix_basic(fattr, server, p);
+		VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
+			p, len, len, qname->name);
+		break;
+	default:
+		PARANOIA("Unknown info level %d\n", level);
+		result = p + WVAL(p, 0);
+		goto out;
+	}
+
+	smb_finish_dirent(server, fattr);
+
+#if 0
+	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
+	   allow the flag to be set anyway. Remove? */
+	switch (server->opt.case_handling) {
+	case SMB_CASE_UPPER:
+		str_upper(qname->name, len);
+		break;
+	case SMB_CASE_LOWER:
+		str_lower(qname->name, len);
+		break;
+	default:
+		break;
+	}
+#endif
+
+	qname->len = 0;
+	n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
+				 qname->name, len,
+				 server->remote_nls, server->local_nls);
+	if (n > 0) {
+		qname->len = n;
+		qname->name = name_buf;
+	}
+
+out:
+	return result;
+}
+
+/* findfirst/findnext flags */
+#define SMB_CLOSE_AFTER_FIRST (1<<0)
+#define SMB_CLOSE_IF_END (1<<1)
+#define SMB_REQUIRE_RESUME_KEY (1<<2)
+#define SMB_CONTINUE_BIT (1<<3)
+
+/*
+ * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
+ * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
+ * go there for advise.
+ *
+ * Bugs Noted:
+ * (1) When using Info Level 1 Win NT 4.0 truncates directory listings 
+ * for certain patterns of names and/or lengths. The breakage pattern
+ * is completely reproducible and can be toggled by the creation of a
+ * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
+ */
+static int
+smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
+		      struct smb_cache_control *ctl)
+{
+	struct dentry *dir = filp->f_path.dentry;
+	struct smb_sb_info *server = server_from_dentry(dir);
+	struct qstr qname;
+	struct smb_fattr fattr;
+
+	unsigned char *p, *lastname;
+	char *mask, *param;
+	__u16 command;
+	int first, entries_seen;
+
+	/* Both NT and OS/2 accept info level 1 (but see note below). */
+	int info_level = 260;
+	const int max_matches = 512;
+
+	unsigned int ff_searchcount = 0;
+	unsigned int ff_eos = 0;
+	unsigned int ff_lastname = 0;
+	unsigned int ff_dir_handle = 0;
+	unsigned int loop_count = 0;
+	unsigned int mask_len, i;
+	int result;
+	struct smb_request *req;
+	unsigned char *name_buf;
+	static struct qstr star = {
+		.name	= "*",
+		.len	= 1,
+	};
+
+	lock_kernel();
+
+	/*
+	 * We always prefer unix style. Use info level 1 for older
+	 * servers that don't do 260.
+	 */
+	if (server->opt.capabilities & SMB_CAP_UNIX)
+		info_level = SMB_FIND_FILE_UNIX;
+	else if (server->opt.protocol < SMB_PROTOCOL_NT1)
+		info_level = 1;
+
+	result = -ENOMEM;
+	if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
+		goto out;
+	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
+		goto out_name;
+	param = req->rq_buffer;
+
+	/*
+	 * Encode the initial path
+	 */
+	mask = param + 12;
+
+	result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
+	if (result <= 0)
+		goto out_free;
+	mask_len = result - 1;	/* mask_len is strlen, not #bytes */
+	result = 0;
+	first = 1;
+	VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
+
+	entries_seen = 2;
+	ff_eos = 0;
+
+	while (ff_eos == 0) {
+		loop_count += 1;
+		if (loop_count > 10) {
+			printk(KERN_WARNING "smb_proc_readdir_long: "
+			       "Looping in FIND_NEXT??\n");
+			result = -EIO;
+			break;
+		}
+
+		if (first != 0) {
+			command = TRANSACT2_FINDFIRST;
+			WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
+			WSET(param, 2, max_matches);	/* max count */
+			WSET(param, 4, SMB_CLOSE_IF_END);
+			WSET(param, 6, info_level);
+			DSET(param, 8, 0);
+		} else {
+			command = TRANSACT2_FINDNEXT;
+
+			VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
+				ff_dir_handle, ff_lastname, mask_len, mask);
+
+			WSET(param, 0, ff_dir_handle);	/* search handle */
+			WSET(param, 2, max_matches);	/* max count */
+			WSET(param, 4, info_level);
+			DSET(param, 6, 0);
+			WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
+		}
+
+		req->rq_trans2_command = command;
+		req->rq_ldata = 0;
+		req->rq_data  = NULL;
+		req->rq_lparm = 12 + mask_len + 1;
+		req->rq_parm  = param;
+		req->rq_flags = 0;
+		result = smb_add_request(req);
+		if (result < 0) {
+			PARANOIA("error=%d, breaking\n", result);
+			break;
+		}
+
+		if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
+			/* a damn Win95 bug - sometimes it clags if you 
+			   ask it too fast */
+			schedule_timeout_interruptible(msecs_to_jiffies(200));
+			continue;
+                }
+
+		if (req->rq_rcls != 0) {
+			result = smb_errno(req);
+			PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
+				 mask, result, req->rq_rcls, req->rq_err);
+			break;
+		}
+
+		/* parse out some important return info */
+		if (first != 0) {
+			ff_dir_handle = WVAL(req->rq_parm, 0);
+			ff_searchcount = WVAL(req->rq_parm, 2);
+			ff_eos = WVAL(req->rq_parm, 4);
+			ff_lastname = WVAL(req->rq_parm, 8);
+		} else {
+			ff_searchcount = WVAL(req->rq_parm, 0);
+			ff_eos = WVAL(req->rq_parm, 2);
+			ff_lastname = WVAL(req->rq_parm, 6);
+		}
+
+		if (ff_searchcount == 0)
+			break;
+
+		/* Now we are ready to parse smb directory entries. */
+
+		/* point to the data bytes */
+		p = req->rq_data;
+		for (i = 0; i < ff_searchcount; i++) {
+			/* make sure we stay within the buffer */
+			if (p >= req->rq_data + req->rq_ldata) {
+				printk(KERN_ERR "smb_proc_readdir_long: "
+				       "dirent pointer outside buffer! "
+				       "%p  %d@%p\n",
+				       p, req->rq_ldata, req->rq_data);
+				result = -EIO; /* always a comm. error? */
+				goto out_free;
+			}
+
+			p = smb_decode_long_dirent(server, p, info_level,
+						   &qname, &fattr, name_buf);
+
+			/* ignore . and .. from the server */
+			if (entries_seen == 2 && qname.name[0] == '.') {
+				if (qname.len == 1)
+					continue;
+				if (qname.name[1] == '.' && qname.len == 2)
+					continue;
+			}
+
+			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
+					    &qname, &fattr))
+				;	/* stop reading? */
+			entries_seen++;
+		}
+
+		VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
+
+		/*
+		 * We might need the lastname for continuations.
+		 *
+		 * Note that some servers (win95?) point to the filename and
+		 * others (NT4, Samba using NT1) to the dir entry. We assume
+		 * here that those who do not point to a filename do not need
+		 * this info to continue the listing.
+		 *
+		 * OS/2 needs this and talks infolevel 1.
+		 * NetApps want lastname with infolevel 260.
+		 * win2k want lastname with infolevel 260, and points to
+		 *       the record not to the name.
+		 * Samba+CifsUnixExt doesn't need lastname.
+		 *
+		 * Both are happy if we return the data they point to. So we do.
+		 * (FIXME: above is not true with win2k)
+		 */
+		mask_len = 0;
+		if (info_level != SMB_FIND_FILE_UNIX &&
+		    ff_lastname > 0 && ff_lastname < req->rq_ldata) {
+			lastname = req->rq_data + ff_lastname;
+
+			switch (info_level) {
+			case 260:
+				mask_len = req->rq_ldata - ff_lastname;
+				break;
+			case 1:
+				/* lastname points to a length byte */
+				mask_len = *lastname++;
+				if (ff_lastname + 1 + mask_len > req->rq_ldata)
+					mask_len = req->rq_ldata - ff_lastname - 1;
+				break;
+			}
+
+			/*
+			 * Update the mask string for the next message.
+			 */
+			if (mask_len > 255)
+				mask_len = 255;
+			if (mask_len)
+				strncpy(mask, lastname, mask_len);
+		}
+		mask_len = strnlen(mask, mask_len);
+		VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
+			mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
+
+		first = 0;
+		loop_count = 0;
+	}
+
+out_free:
+	smb_rput(req);
+out_name:
+	kfree(name_buf);
+out:
+	unlock_kernel();
+	return result;
+}
+
+/*
+ * This version uses the trans2 TRANSACT2_FINDFIRST message 
+ * to get the attribute data.
+ *
+ * Bugs Noted:
+ */
+static int
+smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
+			struct smb_fattr *fattr)
+{
+	char *param, *mask;
+	__u16 date, time;
+	int mask_len, result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+	mask = param + 12;
+
+	mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
+	if (mask_len < 0) {
+		result = mask_len;
+		goto out_free;
+	}
+	VERBOSE("name=%s, len=%d\n", mask, mask_len);
+	WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
+	WSET(param, 2, 1);	/* max count */
+	WSET(param, 4, 1);	/* close after this call */
+	WSET(param, 6, 1);	/* info_level */
+	DSET(param, 8, 0);
+
+	req->rq_trans2_command = TRANSACT2_FINDFIRST;
+	req->rq_ldata = 0;
+	req->rq_data  = NULL;
+	req->rq_lparm = 12 + mask_len;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+	if (req->rq_rcls != 0) {
+		result = smb_errno(req);
+#ifdef SMBFS_PARANOIA
+		if (result != -ENOENT)
+			PARANOIA("error for %s, rcls=%d, err=%d\n",
+				 mask, req->rq_rcls, req->rq_err);
+#endif
+		goto out_free;
+	}
+	/* Make sure we got enough data ... */
+	result = -EINVAL;
+	if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
+		PARANOIA("bad result for %s, len=%d, count=%d\n",
+			 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
+		goto out_free;
+	}
+
+	/*
+	 * Decode the response into the fattr ...
+	 */
+	date = WVAL(req->rq_data, 0);
+	time = WVAL(req->rq_data, 2);
+	fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
+	fattr->f_ctime.tv_nsec = 0;
+
+	date = WVAL(req->rq_data, 4);
+	time = WVAL(req->rq_data, 6);
+	fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
+	fattr->f_atime.tv_nsec = 0;
+
+	date = WVAL(req->rq_data, 8);
+	time = WVAL(req->rq_data, 10);
+	fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
+	fattr->f_mtime.tv_nsec = 0;
+	VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
+		mask, date, time, fattr->f_mtime.tv_sec);
+	fattr->f_size = DVAL(req->rq_data, 12);
+	/* ULONG allocation size */
+	fattr->attr = WVAL(req->rq_data, 20);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
+		      struct smb_fattr *fattr)
+{
+	int result;
+	char *p;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	p = smb_setup_header(req, SMBgetatr, 0, 0);
+	result = smb_simple_encode_path(req, &p, dir, NULL);
+	if (result < 0)
+ 		goto out_free;
+	smb_setup_bcc(req, p);
+
+	if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
+		goto out_free;
+	fattr->attr    = WVAL(req->rq_header, smb_vwv0);
+	fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
+	fattr->f_mtime.tv_nsec = 0;
+	fattr->f_size  = DVAL(req->rq_header, smb_vwv3);
+	fattr->f_ctime = fattr->f_mtime; 
+	fattr->f_atime = fattr->f_mtime; 
+#ifdef SMBFS_DEBUG_TIMESTAMP
+	printk("getattr_core: %s/%s, mtime=%ld\n",
+	       DENTRY_PATH(dir), fattr->f_mtime);
+#endif
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Bugs Noted:
+ * (1) Win 95 swaps the date and time fields in the standard info level.
+ */
+static int
+smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
+			struct smb_request *req, int infolevel)
+{
+	char *p, *param;
+	int result;
+
+	param = req->rq_buffer;
+	WSET(param, 0, infolevel);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
+	if (result < 0)
+		goto out;
+	p = param + 6 + result;
+
+	req->rq_trans2_command = TRANSACT2_QPATHINFO;
+	req->rq_ldata = 0;
+	req->rq_data  = NULL;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out;
+	if (req->rq_rcls != 0) {
+		VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
+			&param[6], result, req->rq_rcls, req->rq_err);
+		result = smb_errno(req);
+		goto out;
+	}
+	result = -ENOENT;
+	if (req->rq_ldata < 22) {
+		PARANOIA("not enough data for %s, len=%d\n",
+			 &param[6], req->rq_ldata);
+		goto out;
+	}
+
+	result = 0;
+out:
+	return result;
+}
+
+static int
+smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
+			    struct smb_fattr *attr)
+{
+	u16 date, time;
+	int off_date = 0, off_time = 2;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
+	if (result < 0)
+		goto out_free;
+
+	/*
+	 * Kludge alert: Win 95 swaps the date and time field,
+	 * contrary to the CIFS docs and Win NT practice.
+	 */
+	if (server->mnt->flags & SMB_MOUNT_WIN95) {
+		off_date = 2;
+		off_time = 0;
+	}
+	date = WVAL(req->rq_data, off_date);
+	time = WVAL(req->rq_data, off_time);
+	attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
+	attr->f_ctime.tv_nsec = 0;
+
+	date = WVAL(req->rq_data, 4 + off_date);
+	time = WVAL(req->rq_data, 4 + off_time);
+	attr->f_atime.tv_sec = date_dos2unix(server, date, time);
+	attr->f_atime.tv_nsec = 0;
+
+	date = WVAL(req->rq_data, 8 + off_date);
+	time = WVAL(req->rq_data, 8 + off_time);
+	attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
+	attr->f_mtime.tv_nsec = 0;
+#ifdef SMBFS_DEBUG_TIMESTAMP
+	printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
+	       DENTRY_PATH(dir), date, time, attr->f_mtime);
+#endif
+	attr->f_size = DVAL(req->rq_data, 12);
+	attr->attr = WVAL(req->rq_data, 20);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
+			    struct smb_fattr *attr)
+{
+	struct smb_request *req;
+	int result;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	result = smb_proc_getattr_trans2(server, dir, req,
+					 SMB_QUERY_FILE_ALL_INFO);
+	if (result < 0)
+		goto out_free;
+
+	attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
+	attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
+	attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
+	/* change (24) */
+	attr->attr = WVAL(req->rq_data, 32);
+	/* pad? (34) */
+	/* allocated size (40) */
+	attr->f_size = LVAL(req->rq_data, 48);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
+		      struct smb_fattr *attr)
+{
+	struct smb_request *req;
+	int result;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	result = smb_proc_getattr_trans2(server, dir, req,
+					 SMB_QUERY_FILE_UNIX_BASIC);
+	if (result < 0)
+		goto out_free;
+
+	smb_decode_unix_basic(attr, server, req->rq_data);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
+		    struct smb_fattr *attr)
+{
+	struct inode *inode = dir->d_inode;
+	int result;
+
+	/* FIXME: why not use the "all" version? */
+	result = smb_proc_getattr_trans2_std(server, dir, attr);
+	if (result < 0)
+		goto out;
+
+	/*
+	 * None of the getattr versions here can make win9x return the right
+	 * filesize if there are changes made to an open file.
+	 * A seek-to-end does return the right size, but we only need to do
+	 * that on files we have written.
+	 */
+	if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
+	    smb_is_open(inode))
+	{
+		__u16 fileid = SMB_I(inode)->fileid;
+		attr->f_size = smb_proc_seek(server, fileid, 2, 0);
+	}
+
+out:
+	return result;
+}
+
+static int
+smb_proc_ops_wait(struct smb_sb_info *server)
+{
+	int result;
+
+	result = wait_event_interruptible_timeout(server->conn_wq,
+				server->conn_complete, 30*HZ);
+
+	if (!result || signal_pending(current))
+		return -EIO;
+
+	return 0;
+}
+
+static int
+smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
+			  struct smb_fattr *fattr)
+{
+	int result;
+
+	if (smb_proc_ops_wait(server) < 0)
+		return -EIO;
+
+	smb_init_dirent(server, fattr);
+	result = server->ops->getattr(server, dir, fattr);
+	smb_finish_dirent(server, fattr);
+
+	return result;
+}
+
+static int
+smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
+		      struct smb_cache_control *ctl)
+{
+	struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
+
+	if (smb_proc_ops_wait(server) < 0)
+		return -EIO;
+
+	return server->ops->readdir(filp, dirent, filldir, ctl);
+}
+
+int
+smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
+{
+	struct smb_sb_info *server = server_from_dentry(dir);
+	int result;
+
+	smb_init_dirent(server, fattr);
+	result = server->ops->getattr(server, dir, fattr);
+	smb_finish_dirent(server, fattr);
+
+	return result;
+}
+
+
+/*
+ * Because of bugs in the core protocol, we use this only to set
+ * attributes. See smb_proc_settime() below for timestamp handling.
+ *
+ * Bugs Noted:
+ * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
+ * with an undocumented error (ERRDOS code 50). Setting
+ * mtime to 0 allows the attributes to be set.
+ * (2) The extra parameters following the name string aren't
+ * in the CIFS docs, but seem to be necessary for operation.
+ */
+static int
+smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
+		      __u16 attr)
+{
+	char *p;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+
+	p = smb_setup_header(req, SMBsetatr, 8, 0);
+	WSET(req->rq_header, smb_vwv0, attr);
+	DSET(req->rq_header, smb_vwv1, 0); /* mtime */
+	WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
+	WSET(req->rq_header, smb_vwv4, 0);
+	WSET(req->rq_header, smb_vwv5, 0);
+	WSET(req->rq_header, smb_vwv6, 0);
+	WSET(req->rq_header, smb_vwv7, 0);
+	result = smb_simple_encode_path(req, &p, dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
+		result = -ENAMETOOLONG;
+		goto out_free;
+	}
+	*p++ = 4;
+	*p++ = 0;
+	smb_setup_bcc(req, p);
+
+	result = smb_request_ok(req, SMBsetatr, 0, 0);
+	if (result < 0)
+		goto out_free;
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Because of bugs in the trans2 setattr messages, we must set
+ * attributes and timestamps separately. The core SMBsetatr
+ * message seems to be the only reliable way to set attributes.
+ */
+int
+smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
+{
+	struct smb_sb_info *server = server_from_dentry(dir);
+	int result;
+
+	VERBOSE("setting %s/%s, open=%d\n", 
+		DENTRY_PATH(dir), smb_is_open(dir->d_inode));
+	result = smb_proc_setattr_core(server, dir, fattr->attr);
+	return result;
+}
+
+/*
+ * Sets the timestamps for an file open with write permissions.
+ */
+static int
+smb_proc_setattr_ext(struct smb_sb_info *server,
+		      struct inode *inode, struct smb_fattr *fattr)
+{
+	__u16 date, time;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBsetattrE, 7, 0);
+	WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
+	/* We don't change the creation time */
+	WSET(req->rq_header, smb_vwv1, 0);
+	WSET(req->rq_header, smb_vwv2, 0);
+	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
+	WSET(req->rq_header, smb_vwv3, date);
+	WSET(req->rq_header, smb_vwv4, time);
+	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
+	WSET(req->rq_header, smb_vwv5, date);
+	WSET(req->rq_header, smb_vwv6, time);
+#ifdef SMBFS_DEBUG_TIMESTAMP
+	printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
+	       date, time, fattr->f_mtime);
+#endif
+
+	req->rq_flags |= SMB_REQ_NORETRY;
+	result = smb_request_ok(req, SMBsetattrE, 0, 0);
+	if (result < 0)
+		goto out_free;
+	result = 0;
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Bugs Noted:
+ * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
+ * set the file's attribute flags.
+ */
+static int
+smb_proc_setattr_trans2(struct smb_sb_info *server,
+			struct dentry *dir, struct smb_fattr *fattr)
+{
+	__u16 date, time;
+	char *p, *param;
+	int result;
+	char data[26];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, 1);	/* Info level SMB_INFO_STANDARD */
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	WSET(data, 0, 0); /* creation time */
+	WSET(data, 2, 0);
+	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
+	WSET(data, 4, date);
+	WSET(data, 6, time);
+	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
+	WSET(data, 8, date);
+	WSET(data, 10, time);
+#ifdef SMBFS_DEBUG_TIMESTAMP
+	printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", 
+	       DENTRY_PATH(dir), date, time, fattr->f_mtime);
+#endif
+	DSET(data, 12, 0); /* size */
+	DSET(data, 16, 0); /* blksize */
+	WSET(data, 20, 0); /* attr */
+	DSET(data, 22, 0); /* ULONG EA size */
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_ldata = 26;
+	req->rq_data  = data;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+	result = 0;
+	if (req->rq_rcls != 0)
+		result = smb_errno(req);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * ATTR_MODE      0x001
+ * ATTR_UID       0x002
+ * ATTR_GID       0x004
+ * ATTR_SIZE      0x008
+ * ATTR_ATIME     0x010
+ * ATTR_MTIME     0x020
+ * ATTR_CTIME     0x040
+ * ATTR_ATIME_SET 0x080
+ * ATTR_MTIME_SET 0x100
+ * ATTR_FORCE     0x200	
+ * ATTR_ATTR_FLAG 0x400
+ *
+ * major/minor should only be set by mknod.
+ */
+int
+smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
+		      unsigned int major, unsigned int minor)
+{
+	struct smb_sb_info *server = server_from_dentry(d);
+	u64 nttime;
+	char *p, *param;
+	int result;
+	char data[100];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	/* 0 L file size in bytes */
+	/* 8 L file size on disk in bytes (block count) */
+	/* 40 L uid */
+	/* 48 L gid */
+	/* 56 W file type enum */
+	/* 60 L devmajor */
+	/* 68 L devminor */
+	/* 76 L unique ID (inode) */
+	/* 84 L permissions */
+	/* 92 L link count */
+	LSET(data, 0, SMB_SIZE_NO_CHANGE);
+	LSET(data, 8, SMB_SIZE_NO_CHANGE);
+	LSET(data, 16, SMB_TIME_NO_CHANGE);
+	LSET(data, 24, SMB_TIME_NO_CHANGE);
+	LSET(data, 32, SMB_TIME_NO_CHANGE);
+	LSET(data, 40, SMB_UID_NO_CHANGE);
+	LSET(data, 48, SMB_GID_NO_CHANGE);
+	DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
+	LSET(data, 60, major);
+	LSET(data, 68, minor);
+	LSET(data, 76, 0);
+	LSET(data, 84, SMB_MODE_NO_CHANGE);
+	LSET(data, 92, 0);
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		LSET(data, 0, attr->ia_size);
+		LSET(data, 8, 0); /* can't set anyway */
+	}
+
+	/*
+	 * FIXME: check the conversion function it the correct one
+	 *
+	 * we can't set ctime but we might as well pass this to the server
+	 * and let it ignore it.
+	 */
+	if (attr->ia_valid & ATTR_CTIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_ctime);
+		LSET(data, 16, nttime);
+	}
+	if (attr->ia_valid & ATTR_ATIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_atime);
+		LSET(data, 24, nttime);
+	}
+	if (attr->ia_valid & ATTR_MTIME) {
+		nttime = smb_unixutc2ntutc(attr->ia_mtime);
+		LSET(data, 32, nttime);
+	}
+	
+	if (attr->ia_valid & ATTR_UID) {
+		LSET(data, 40, attr->ia_uid);
+	}
+	if (attr->ia_valid & ATTR_GID) {
+		LSET(data, 48, attr->ia_gid); 
+	}
+	
+	if (attr->ia_valid & ATTR_MODE) {
+		LSET(data, 84, attr->ia_mode);
+	}
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_ldata = 100;
+	req->rq_data  = data;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+/*
+ * Set the modify and access timestamps for a file.
+ *
+ * Incredibly enough, in all of SMB there is no message to allow
+ * setting both attributes and timestamps at once. 
+ *
+ * Bugs Noted:
+ * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message 
+ * with info level 1 (INFO_STANDARD).
+ * (2) Win 95 seems not to support setting directory timestamps.
+ * (3) Under the core protocol apparently the only way to set the
+ * timestamp is to open and close the file.
+ */
+int
+smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
+{
+	struct smb_sb_info *server = server_from_dentry(dentry);
+	struct inode *inode = dentry->d_inode;
+	int result;
+
+	VERBOSE("setting %s/%s, open=%d\n",
+		DENTRY_PATH(dentry), smb_is_open(inode));
+
+	/* setting the time on a Win95 server fails (tridge) */
+	if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && 
+	    !(server->mnt->flags & SMB_MOUNT_WIN95)) {
+		if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
+			result = smb_proc_setattr_ext(server, inode, fattr);
+		else
+			result = smb_proc_setattr_trans2(server, dentry, fattr);
+	} else {
+		/*
+		 * Fail silently on directories ... timestamp can't be set?
+		 */
+		result = 0;
+		if (S_ISREG(inode->i_mode)) {
+			/*
+			 * Set the mtime by opening and closing the file.
+			 * Note that the file is opened read-only, but this
+			 * still allows us to set the date (tridge)
+			 */
+			result = -EACCES;
+			if (!smb_is_open(inode))
+				smb_proc_open(server, dentry, SMB_O_RDONLY);
+			if (smb_is_open(inode)) {
+				inode->i_mtime = fattr->f_mtime;
+				result = smb_proc_close_inode(server, inode);
+			}
+		}
+	}
+
+	return result;
+}
+
+int
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
+{
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
+	int result;
+	char *p;
+	long unit;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 0)))
+		goto out;
+
+	smb_setup_header(req, SMBdskattr, 0, 0);
+	if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
+		goto out_free;
+	p = SMB_VWV(req->rq_header);
+	unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
+	attr->f_blocks = WVAL(p, 0) * unit;
+	attr->f_bsize  = SMB_ST_BLKSIZE;
+	attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+int
+smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
+		   char *buffer, int len)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	req->rq_trans2_command = TRANSACT2_QPATHINFO;
+	req->rq_ldata = 0;
+	req->rq_data  = NULL;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+		&param[6], result, req->rq_rcls, req->rq_err);
+
+	/* copy data up to the \0 or buffer length */
+	result = len;
+	if (req->rq_ldata < len)
+		result = req->rq_ldata;
+	strncpy(buffer, req->rq_data, result);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+/*
+ * Create a symlink object called dentry which points to oldpath.
+ * Samba does not permit dangling links but returns a suitable error message.
+ */
+int
+smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
+		 const char *oldpath)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_ldata = strlen(oldpath) + 1;
+	req->rq_data  = (char *) oldpath;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+		&param[6], result, req->rq_rcls, req->rq_err);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+/*
+ * Create a hard link object called new_dentry which points to dentry.
+ */
+int
+smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
+	      struct dentry *new_dentry)
+{
+	char *p, *param;
+	int result;
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
+		goto out;
+	param = req->rq_buffer;
+
+	WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
+	DSET(param, 2, 0);
+	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
+				 new_dentry, NULL);
+	if (result < 0)
+		goto out_free;
+	p = param + 6 + result;
+
+	/* Grr, pointless separation of parameters and data ... */
+	req->rq_data = p;
+	req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
+					dentry, NULL);
+
+	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
+	req->rq_lparm = p - param;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
+	       &param[6], result, req->rq_rcls, req->rq_err);
+	result = 0;
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+static int
+smb_proc_query_cifsunix(struct smb_sb_info *server)
+{
+	int result;
+	int major, minor;
+	u64 caps;
+	char param[2];
+	struct smb_request *req;
+
+	result = -ENOMEM;
+	if (! (req = smb_alloc_request(server, 100)))
+		goto out;
+
+	WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
+
+	req->rq_trans2_command = TRANSACT2_QFSINFO;
+	req->rq_ldata = 0;
+	req->rq_data  = NULL;
+	req->rq_lparm = 2;
+	req->rq_parm  = param;
+	req->rq_flags = 0;
+	result = smb_add_request(req);
+	if (result < 0)
+		goto out_free;
+
+	if (req->rq_ldata < 12) {
+		PARANOIA("Not enough data\n");
+		goto out_free;
+	}
+	major = WVAL(req->rq_data, 0);
+	minor = WVAL(req->rq_data, 2);
+
+	DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
+	       major, minor);
+	/* FIXME: verify that we are ok with this major/minor? */
+
+	caps = LVAL(req->rq_data, 4);
+	DEBUG1("Server capabilities 0x%016llx\n", caps);
+
+out_free:
+	smb_rput(req);
+out:
+	return result;
+}
+
+
+static void
+install_ops(struct smb_ops *dst, struct smb_ops *src)
+{
+	memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
+}
+
+/* < LANMAN2 */
+static struct smb_ops smb_ops_core =
+{
+	.read		= smb_proc_read,
+	.write		= smb_proc_write,
+	.readdir	= smb_proc_readdir_short,
+	.getattr	= smb_proc_getattr_core,
+	.truncate	= smb_proc_trunc32,
+};
+
+/* LANMAN2, OS/2, others? */
+static struct smb_ops smb_ops_os2 =
+{
+	.read		= smb_proc_read,
+	.write		= smb_proc_write,
+	.readdir	= smb_proc_readdir_long,
+	.getattr	= smb_proc_getattr_trans2_std,
+	.truncate	= smb_proc_trunc32,
+};
+
+/* Win95, and possibly some NetApp versions too */
+static struct smb_ops smb_ops_win95 =
+{
+	.read		= smb_proc_read,    /* does not support 12word readX */
+	.write		= smb_proc_write,
+	.readdir	= smb_proc_readdir_long,
+	.getattr	= smb_proc_getattr_95,
+	.truncate	= smb_proc_trunc95,
+};
+
+/* Samba, NT4 and NT5 */
+static struct smb_ops smb_ops_winNT =
+{
+	.read		= smb_proc_readX,
+	.write		= smb_proc_writeX,
+	.readdir	= smb_proc_readdir_long,
+	.getattr	= smb_proc_getattr_trans2_all,
+	.truncate	= smb_proc_trunc64,
+};
+
+/* Samba w/ unix extensions. Others? */
+static struct smb_ops smb_ops_unix =
+{
+	.read		= smb_proc_readX,
+	.write		= smb_proc_writeX,
+	.readdir	= smb_proc_readdir_long,
+	.getattr	= smb_proc_getattr_unix,
+	/* FIXME: core/ext/time setattr needs to be cleaned up! */
+	/* .setattr	= smb_proc_setattr_unix, */
+	.truncate	= smb_proc_trunc64,
+};
+
+/* Place holder until real ops are in place */
+static struct smb_ops smb_ops_null =
+{
+	.readdir	= smb_proc_readdir_null,
+	.getattr	= smb_proc_getattr_null,
+};
+
+void smb_install_null_ops(struct smb_ops *ops)
+{
+	install_ops(ops, &smb_ops_null);
+}
diff --git a/drivers/staging/smbfs/proto.h b/drivers/staging/smbfs/proto.h
new file mode 100644
index 000000000000..05939a6f43e6
--- /dev/null
+++ b/drivers/staging/smbfs/proto.h
@@ -0,0 +1,87 @@
+/*
+ *  Autogenerated with cproto on:  Sat Sep 13 17:18:51 CEST 2003
+ */
+
+struct smb_request;
+struct sock;
+struct statfs;
+
+/* proc.c */
+extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
+extern __u32 smb_len(__u8 *p);
+extern int smb_get_rsize(struct smb_sb_info *server);
+extern int smb_get_wsize(struct smb_sb_info *server);
+extern int smb_errno(struct smb_request *req);
+extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
+extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
+extern int smb_open(struct dentry *dentry, int wish);
+extern int smb_close(struct inode *ino);
+extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
+extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
+extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
+extern int smb_proc_mkdir(struct dentry *dentry);
+extern int smb_proc_rmdir(struct dentry *dentry);
+extern int smb_proc_unlink(struct dentry *dentry);
+extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
+extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
+				 struct super_block *sb);
+extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
+extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
+extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
+extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
+extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
+extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
+extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
+extern void smb_install_null_ops(struct smb_ops *ops);
+/* dir.c */
+extern const struct file_operations smb_dir_operations;
+extern const struct inode_operations smb_dir_inode_operations;
+extern const struct inode_operations smb_dir_inode_operations_unix;
+extern void smb_new_dentry(struct dentry *dentry);
+extern void smb_renew_times(struct dentry *dentry);
+/* cache.c */
+extern void smb_invalid_dir_cache(struct inode *dir);
+extern void smb_invalidate_dircache_entries(struct dentry *parent);
+extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
+extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
+/* sock.c */
+extern void smb_data_ready(struct sock *sk, int len);
+extern int smb_valid_socket(struct inode *inode);
+extern void smb_close_socket(struct smb_sb_info *server);
+extern int smb_recv_available(struct smb_sb_info *server);
+extern int smb_receive_header(struct smb_sb_info *server);
+extern int smb_receive_drop(struct smb_sb_info *server);
+extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
+extern int smb_send_request(struct smb_request *req);
+/* inode.c */
+extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
+extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
+extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
+extern void smb_invalidate_inodes(struct smb_sb_info *server);
+extern int smb_revalidate_inode(struct dentry *dentry);
+extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
+/* file.c */
+extern const struct address_space_operations smb_file_aops;
+extern const struct file_operations smb_file_operations;
+extern const struct inode_operations smb_file_inode_operations;
+/* ioctl.c */
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+/* smbiod.c */
+extern void smbiod_wake_up(void);
+extern int smbiod_register_server(struct smb_sb_info *server);
+extern void smbiod_unregister_server(struct smb_sb_info *server);
+extern void smbiod_flush(struct smb_sb_info *server);
+extern int smbiod_retry(struct smb_sb_info *server);
+/* request.c */
+extern int smb_init_request_cache(void);
+extern void smb_destroy_request_cache(void);
+extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
+extern void smb_rput(struct smb_request *req);
+extern int smb_add_request(struct smb_request *req);
+extern int smb_request_send_server(struct smb_sb_info *server);
+extern int smb_request_recv(struct smb_sb_info *server);
+/* symlink.c */
+extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
+extern const struct inode_operations smb_link_inode_operations;
diff --git a/drivers/staging/smbfs/request.c b/drivers/staging/smbfs/request.c
new file mode 100644
index 000000000000..3e7716864306
--- /dev/null
+++ b/drivers/staging/smbfs/request.c
@@ -0,0 +1,817 @@
+/*
+ *  request.c
+ *
+ *  Copyright (C) 2001 by Urban Widmark
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/sched.h>
+
+#include "smb_fs.h"
+#include "smbno.h"
+#include "smb_mount.h"
+#include "smb_debug.h"
+#include "request.h"
+#include "proto.h"
+
+/* #define SMB_SLAB_DEBUG	(SLAB_RED_ZONE | SLAB_POISON) */
+#define SMB_SLAB_DEBUG	0
+
+/* cache for request structures */
+static struct kmem_cache *req_cachep;
+
+static int smb_request_send_req(struct smb_request *req);
+
+/*
+  /proc/slabinfo:
+  name, active, num, objsize, active_slabs, num_slaps, #pages
+*/
+
+
+int smb_init_request_cache(void)
+{
+	req_cachep = kmem_cache_create("smb_request",
+				       sizeof(struct smb_request), 0,
+				       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
+				       NULL);
+	if (req_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void smb_destroy_request_cache(void)
+{
+	kmem_cache_destroy(req_cachep);
+}
+
+/*
+ * Allocate and initialise a request structure
+ */
+static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
+						int bufsize)
+{
+	struct smb_request *req;
+	unsigned char *buf = NULL;
+
+	req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
+	VERBOSE("allocating request: %p\n", req);
+	if (!req)
+		goto out;
+
+	if (bufsize > 0) {
+		buf = kmalloc(bufsize, GFP_NOFS);
+		if (!buf) {
+			kmem_cache_free(req_cachep, req);
+			return NULL;
+		}
+	}
+
+	req->rq_buffer = buf;
+	req->rq_bufsize = bufsize;
+	req->rq_server = server;
+	init_waitqueue_head(&req->rq_wait);
+	INIT_LIST_HEAD(&req->rq_queue);
+	atomic_set(&req->rq_count, 1);
+
+out:
+	return req;
+}
+
+struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
+{
+	struct smb_request *req = NULL;
+
+	for (;;) {
+		atomic_inc(&server->nr_requests);
+		if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
+			req = smb_do_alloc_request(server, bufsize);
+			if (req != NULL)
+				break;
+		}
+
+#if 0
+		/*
+		 * Try to free up at least one request in order to stay
+		 * below the hard limit
+		 */
+                if (nfs_try_to_free_pages(server))
+			continue;
+
+		if (fatal_signal_pending(current))
+			return ERR_PTR(-ERESTARTSYS);
+		current->policy = SCHED_YIELD;
+		schedule();
+#else
+		/* FIXME: we want something like nfs does above, but that
+		   requires changes to all callers and can wait. */
+		break;
+#endif
+	}
+	return req;
+}
+
+static void smb_free_request(struct smb_request *req)
+{
+	atomic_dec(&req->rq_server->nr_requests);
+	if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
+		kfree(req->rq_buffer);
+	kfree(req->rq_trans2buffer);
+	kmem_cache_free(req_cachep, req);
+}
+
+/*
+ * What prevents a rget to race with a rput? The count must never drop to zero
+ * while it is in use. Only rput if it is ok that it is free'd.
+ */
+static void smb_rget(struct smb_request *req)
+{
+	atomic_inc(&req->rq_count);
+}
+void smb_rput(struct smb_request *req)
+{
+	if (atomic_dec_and_test(&req->rq_count)) {
+		list_del_init(&req->rq_queue);
+		smb_free_request(req);
+	}
+}
+
+/* setup to receive the data part of the SMB */
+static int smb_setup_bcc(struct smb_request *req)
+{
+	int result = 0;
+	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
+
+	if (req->rq_rlen > req->rq_bufsize) {
+		PARANOIA("Packet too large %d > %d\n",
+			 req->rq_rlen, req->rq_bufsize);
+		return -ENOBUFS;
+	}
+
+	req->rq_iov[0].iov_base = req->rq_buffer;
+	req->rq_iov[0].iov_len  = req->rq_rlen;
+	req->rq_iovlen = 1;
+
+	return result;
+}
+
+/*
+ * Prepare a "normal" request structure.
+ */
+static int smb_setup_request(struct smb_request *req)
+{
+	int len = smb_len(req->rq_header) + 4;
+	req->rq_slen = len;
+
+	/* if we expect a data part in the reply we set the iov's to read it */
+	if (req->rq_resp_bcc)
+		req->rq_setup_read = smb_setup_bcc;
+
+	/* This tries to support re-using the same request */
+	req->rq_bytes_sent = 0;
+	req->rq_rcls = 0;
+	req->rq_err = 0;
+	req->rq_errno = 0;
+	req->rq_fragment = 0;
+	kfree(req->rq_trans2buffer);
+	req->rq_trans2buffer = NULL;
+
+	return 0;
+}
+
+/*
+ * Prepare a transaction2 request structure
+ */
+static int smb_setup_trans2request(struct smb_request *req)
+{
+	struct smb_sb_info *server = req->rq_server;
+	int mparam, mdata;
+	static unsigned char padding[4];
+
+	/* I know the following is very ugly, but I want to build the
+	   smb packet as efficiently as possible. */
+
+	const int smb_parameters = 15;
+	const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
+	const int oparam = ALIGN(header + 3, sizeof(u32));
+	const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
+	const int bcc = (req->rq_data ? odata + req->rq_ldata :
+					oparam + req->rq_lparm) - header;
+
+	if ((bcc + oparam) > server->opt.max_xmit)
+		return -ENOMEM;
+	smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
+
+	/*
+	 * max parameters + max data + max setup == bufsize to make NT4 happy
+	 * and not abort the transfer or split into multiple responses. It also
+	 * makes smbfs happy as handling packets larger than the buffer size
+	 * is extra work.
+	 *
+	 * OS/2 is probably going to hate me for this ...
+	 */
+	mparam = SMB_TRANS2_MAX_PARAM;
+	mdata = req->rq_bufsize - mparam;
+
+	mdata = server->opt.max_xmit - mparam - 100;
+	if (mdata < 1024) {
+		mdata = 1024;
+		mparam = 20;
+	}
+
+#if 0
+	/* NT/win2k has ~4k max_xmit, so with this we request more than it wants
+	   to return as one SMB. Useful for testing the fragmented trans2
+	   handling. */
+	mdata = 8192;
+#endif
+
+	WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
+	WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
+	WSET(req->rq_header, smb_mprcnt, mparam);
+	WSET(req->rq_header, smb_mdrcnt, mdata);
+	WSET(req->rq_header, smb_msrcnt, 0);    /* max setup always 0 ? */
+	WSET(req->rq_header, smb_flags, 0);
+	DSET(req->rq_header, smb_timeout, 0);
+	WSET(req->rq_header, smb_pscnt, req->rq_lparm);
+	WSET(req->rq_header, smb_psoff, oparam - 4);
+	WSET(req->rq_header, smb_dscnt, req->rq_ldata);
+	WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
+	*(req->rq_header + smb_suwcnt) = 0x01;          /* setup count */
+	*(req->rq_header + smb_suwcnt + 1) = 0x00;      /* reserved */
+	WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
+
+	req->rq_iovlen = 2;
+	req->rq_iov[0].iov_base = (void *) req->rq_header;
+	req->rq_iov[0].iov_len = oparam;
+	req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
+	req->rq_iov[1].iov_len = req->rq_lparm;
+	req->rq_slen = oparam + req->rq_lparm;
+
+	if (req->rq_data) {
+		req->rq_iovlen += 2;
+		req->rq_iov[2].iov_base = padding;
+		req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
+		req->rq_iov[3].iov_base = req->rq_data;
+		req->rq_iov[3].iov_len = req->rq_ldata;
+		req->rq_slen = odata + req->rq_ldata;
+	}
+
+	/* always a data part for trans2 replies */
+	req->rq_setup_read = smb_setup_bcc;
+
+	return 0;
+}
+
+/*
+ * Add a request and tell smbiod to process it
+ */
+int smb_add_request(struct smb_request *req)
+{
+	long timeleft;
+	struct smb_sb_info *server = req->rq_server;
+	int result = 0;
+
+	smb_setup_request(req);
+	if (req->rq_trans2_command) {
+		if (req->rq_buffer == NULL) {
+			PARANOIA("trans2 attempted without response buffer!\n");
+			return -EIO;
+		}
+		result = smb_setup_trans2request(req);
+	}
+	if (result < 0)
+		return result;
+
+#ifdef SMB_DEBUG_PACKET_SIZE
+	add_xmit_stats(req);
+#endif
+
+	/* add 'req' to the queue of requests */
+	if (smb_lock_server_interruptible(server))
+		return -EINTR;
+
+	/*
+	 * Try to send the request as the process. If that fails we queue the
+	 * request and let smbiod send it later.
+	 */
+
+	/* FIXME: each server has a number on the maximum number of parallel
+	   requests. 10, 50 or so. We should not allow more requests to be
+	   active. */
+	if (server->mid > 0xf000)
+		server->mid = 0;
+	req->rq_mid = server->mid++;
+	WSET(req->rq_header, smb_mid, req->rq_mid);
+
+	result = 0;
+	if (server->state == CONN_VALID) {
+		if (list_empty(&server->xmitq))
+			result = smb_request_send_req(req);
+		if (result < 0) {
+			/* Connection lost? */
+			server->conn_error = result;
+			server->state = CONN_INVALID;
+		}
+	}
+	if (result != 1)
+		list_add_tail(&req->rq_queue, &server->xmitq);
+	smb_rget(req);
+
+	if (server->state != CONN_VALID)
+		smbiod_retry(server);
+
+	smb_unlock_server(server);
+
+	smbiod_wake_up();
+
+	timeleft = wait_event_interruptible_timeout(req->rq_wait,
+				    req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
+	if (!timeleft || signal_pending(current)) {
+		/*
+		 * On timeout or on interrupt we want to try and remove the
+		 * request from the recvq/xmitq.
+		 * First check if the request is still part of a queue. (May
+		 * have been removed by some error condition)
+		 */
+		smb_lock_server(server);
+		if (!list_empty(&req->rq_queue)) {
+			list_del_init(&req->rq_queue);
+			smb_rput(req);
+		}
+		smb_unlock_server(server);
+	}
+
+	if (!timeleft) {
+		PARANOIA("request [%p, mid=%d] timed out!\n",
+			 req, req->rq_mid);
+		VERBOSE("smb_com:  %02x\n", *(req->rq_header + smb_com));
+		VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
+		VERBOSE("smb_flg:  %02x\n", *(req->rq_header + smb_flg));
+		VERBOSE("smb_tid:  %04x\n", WVAL(req->rq_header, smb_tid));
+		VERBOSE("smb_pid:  %04x\n", WVAL(req->rq_header, smb_pid));
+		VERBOSE("smb_uid:  %04x\n", WVAL(req->rq_header, smb_uid));
+		VERBOSE("smb_mid:  %04x\n", WVAL(req->rq_header, smb_mid));
+		VERBOSE("smb_wct:  %02x\n", *(req->rq_header + smb_wct));
+
+		req->rq_rcls = ERRSRV;
+		req->rq_err  = ERRtimeout;
+
+		/* Just in case it was "stuck" */
+		smbiod_wake_up();
+	}
+	VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
+
+	if (req->rq_rcls != 0)
+		req->rq_errno = smb_errno(req);
+	if (signal_pending(current))
+		req->rq_errno = -ERESTARTSYS;
+	return req->rq_errno;
+}
+
+/*
+ * Send a request and place it on the recvq if successfully sent.
+ * Must be called with the server lock held.
+ */
+static int smb_request_send_req(struct smb_request *req)
+{
+	struct smb_sb_info *server = req->rq_server;
+	int result;
+
+	if (req->rq_bytes_sent == 0) {
+		WSET(req->rq_header, smb_tid, server->opt.tid);
+		WSET(req->rq_header, smb_pid, 1);
+		WSET(req->rq_header, smb_uid, server->opt.server_uid);
+	}
+
+	result = smb_send_request(req);
+	if (result < 0 && result != -EAGAIN)
+		goto out;
+
+	result = 0;
+	if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
+		goto out;
+
+	list_move_tail(&req->rq_queue, &server->recvq);
+	result = 1;
+out:
+	return result;
+}
+
+/*
+ * Sends one request for this server. (smbiod)
+ * Must be called with the server lock held.
+ * Returns: <0 on error
+ *           0 if no request could be completely sent
+ *           1 if all data for one request was sent
+ */
+int smb_request_send_server(struct smb_sb_info *server)
+{
+	struct list_head *head;
+	struct smb_request *req;
+	int result;
+
+	if (server->state != CONN_VALID)
+		return 0;
+
+	/* dequeue first request, if any */
+	req = NULL;
+	head = server->xmitq.next;
+	if (head != &server->xmitq) {
+		req = list_entry(head, struct smb_request, rq_queue);
+	}
+	if (!req)
+		return 0;
+
+	result = smb_request_send_req(req);
+	if (result < 0) {
+		server->conn_error = result;
+		list_move(&req->rq_queue, &server->xmitq);
+		result = -EIO;
+		goto out;
+	}
+
+out:
+	return result;
+}
+
+/*
+ * Try to find a request matching this "mid". Typically the first entry will
+ * be the matching one.
+ */
+static struct smb_request *find_request(struct smb_sb_info *server, int mid)
+{
+	struct list_head *tmp;
+	struct smb_request *req = NULL;
+
+	list_for_each(tmp, &server->recvq) {
+		req = list_entry(tmp, struct smb_request, rq_queue);
+		if (req->rq_mid == mid) {
+			break;
+		}
+		req = NULL;
+	}
+
+	if (!req) {
+		VERBOSE("received reply with mid %d but no request!\n",
+			WVAL(server->header, smb_mid));
+		server->rstate = SMB_RECV_DROP;
+	}
+
+	return req;
+}
+
+/*
+ * Called when we have read the smb header and believe this is a response.
+ */
+static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
+{
+	int hdrlen, wct;
+
+	memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
+
+	wct = *(req->rq_header + smb_wct);
+	if (wct > 20) {	
+		PARANOIA("wct too large, %d > 20\n", wct);
+		server->rstate = SMB_RECV_DROP;
+		return 0;
+	}
+
+	req->rq_resp_wct = wct;
+	hdrlen = SMB_HEADER_LEN + wct*2 + 2;
+	VERBOSE("header length: %d   smb_wct: %2d\n", hdrlen, wct);
+
+	req->rq_bytes_recvd = SMB_HEADER_LEN;
+	req->rq_rlen = hdrlen;
+	req->rq_iov[0].iov_base = req->rq_header;
+	req->rq_iov[0].iov_len  = hdrlen;
+	req->rq_iovlen = 1;
+	server->rstate = SMB_RECV_PARAM;
+
+#ifdef SMB_DEBUG_PACKET_SIZE
+	add_recv_stats(smb_len(server->header));
+#endif
+	return 0;
+}
+
+/*
+ * Reads the SMB parameters
+ */
+static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
+{
+	int result;
+
+	result = smb_receive(server, req);
+	if (result < 0)
+		return result;
+	if (req->rq_bytes_recvd < req->rq_rlen)
+		return 0;
+
+	VERBOSE("result: %d   smb_bcc:  %04x\n", result,
+		WVAL(req->rq_header, SMB_HEADER_LEN +
+		     (*(req->rq_header + smb_wct) * 2)));
+
+	result = 0;
+	req->rq_iov[0].iov_base = NULL;
+	req->rq_rlen = 0;
+	if (req->rq_callback)
+		req->rq_callback(req);
+	else if (req->rq_setup_read)
+		result = req->rq_setup_read(req);
+	if (result < 0) {
+		server->rstate = SMB_RECV_DROP;
+		return result;
+	}
+
+	server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
+
+	req->rq_bytes_recvd = 0;	// recvd out of the iov
+
+	VERBOSE("rlen: %d\n", req->rq_rlen);
+	if (req->rq_rlen < 0) {
+		PARANOIA("Parameters read beyond end of packet!\n");
+		server->rstate = SMB_RECV_END;
+		return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * Reads the SMB data
+ */
+static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
+{
+	int result;
+
+	result = smb_receive(server, req);
+	if (result < 0)
+		goto out;
+	if (req->rq_bytes_recvd < req->rq_rlen)
+		goto out;
+	server->rstate = SMB_RECV_END;
+out:
+	VERBOSE("result: %d\n", result);
+	return result;
+}
+
+/*
+ * Receive a transaction2 response
+ * Return: 0 if the response has been fully read
+ *         1 if there are further "fragments" to read
+ *        <0 if there is an error
+ */
+static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
+{
+	unsigned char *inbuf;
+	unsigned int parm_disp, parm_offset, parm_count, parm_tot;
+	unsigned int data_disp, data_offset, data_count, data_tot;
+	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
+
+	VERBOSE("handling trans2\n");
+
+	inbuf = req->rq_header;
+	data_tot    = WVAL(inbuf, smb_tdrcnt);
+	parm_tot    = WVAL(inbuf, smb_tprcnt);
+	parm_disp   = WVAL(inbuf, smb_prdisp);
+	parm_offset = WVAL(inbuf, smb_proff);
+	parm_count  = WVAL(inbuf, smb_prcnt);
+	data_disp   = WVAL(inbuf, smb_drdisp);
+	data_offset = WVAL(inbuf, smb_droff);
+	data_count  = WVAL(inbuf, smb_drcnt);
+
+	/* Modify offset for the split header/buffer we use */
+	if (data_count || data_offset) {
+		if (unlikely(data_offset < hdrlen))
+			goto out_bad_data;
+		else
+			data_offset -= hdrlen;
+	}
+	if (parm_count || parm_offset) {
+		if (unlikely(parm_offset < hdrlen))
+			goto out_bad_parm;
+		else
+			parm_offset -= hdrlen;
+	}
+
+	if (parm_count == parm_tot && data_count == data_tot) {
+		/*
+		 * This packet has all the trans2 data.
+		 *
+		 * We setup the request so that this will be the common
+		 * case. It may be a server error to not return a
+		 * response that fits.
+		 */
+		VERBOSE("single trans2 response  "
+			"dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
+			data_count, parm_count,
+			data_offset, parm_offset);
+		req->rq_ldata = data_count;
+		req->rq_lparm = parm_count;
+		req->rq_data = req->rq_buffer + data_offset;
+		req->rq_parm = req->rq_buffer + parm_offset;
+		if (unlikely(parm_offset + parm_count > req->rq_rlen))
+			goto out_bad_parm;
+		if (unlikely(data_offset + data_count > req->rq_rlen))
+			goto out_bad_data;
+		return 0;
+	}
+
+	VERBOSE("multi trans2 response  "
+		"frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
+		req->rq_fragment,
+		data_count, parm_count,
+		data_offset, parm_offset);
+
+	if (!req->rq_fragment) {
+		int buf_len;
+
+		/* We got the first trans2 fragment */
+		req->rq_fragment = 1;
+		req->rq_total_data = data_tot;
+		req->rq_total_parm = parm_tot;
+		req->rq_ldata = 0;
+		req->rq_lparm = 0;
+
+		buf_len = data_tot + parm_tot;
+		if (buf_len > SMB_MAX_PACKET_SIZE)
+			goto out_too_long;
+
+		req->rq_trans2bufsize = buf_len;
+		req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
+		if (!req->rq_trans2buffer)
+			goto out_no_mem;
+
+		req->rq_parm = req->rq_trans2buffer;
+		req->rq_data = req->rq_trans2buffer + parm_tot;
+	} else if (unlikely(req->rq_total_data < data_tot ||
+			    req->rq_total_parm < parm_tot))
+		goto out_data_grew;
+
+	if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
+		     parm_offset + parm_count > req->rq_rlen))
+		goto out_bad_parm;
+	if (unlikely(data_disp + data_count > req->rq_total_data ||
+		     data_offset + data_count > req->rq_rlen))
+		goto out_bad_data;
+
+	inbuf = req->rq_buffer;
+	memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
+	memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
+
+	req->rq_ldata += data_count;
+	req->rq_lparm += parm_count;
+
+	/*
+	 * Check whether we've received all of the data. Note that
+	 * we use the packet totals -- total lengths might shrink!
+	 */
+	if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
+		req->rq_ldata = data_tot;
+		req->rq_lparm = parm_tot;
+		return 0;
+	}
+	return 1;
+
+out_too_long:
+	printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
+		data_tot, parm_tot);
+	goto out_EIO;
+out_no_mem:
+	printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
+	       req->rq_trans2bufsize);
+	req->rq_errno = -ENOMEM;
+	goto out;
+out_data_grew:
+	printk(KERN_ERR "smb_trans2: data/params grew!\n");
+	goto out_EIO;
+out_bad_parm:
+	printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
+	       parm_disp, parm_count, parm_tot, parm_offset);
+	goto out_EIO;
+out_bad_data:
+	printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
+	       data_disp, data_count, data_tot, data_offset);
+out_EIO:
+	req->rq_errno = -EIO;
+out:
+	return req->rq_errno;
+}
+
+/*
+ * State machine for receiving responses. We handle the fact that we can't
+ * read the full response in one try by having states telling us how much we
+ * have read.
+ *
+ * Must be called with the server lock held (only called from smbiod).
+ *
+ * Return: <0 on error
+ */
+int smb_request_recv(struct smb_sb_info *server)
+{
+	struct smb_request *req = NULL;
+	int result = 0;
+
+	if (smb_recv_available(server) <= 0)
+		return 0;
+
+	VERBOSE("state: %d\n", server->rstate);
+	switch (server->rstate) {
+	case SMB_RECV_DROP:
+		result = smb_receive_drop(server);
+		if (result < 0)
+			break;
+		if (server->rstate == SMB_RECV_DROP)
+			break;
+		server->rstate = SMB_RECV_START;
+		/* fallthrough */
+	case SMB_RECV_START:
+		server->smb_read = 0;
+		server->rstate = SMB_RECV_HEADER;
+		/* fallthrough */
+	case SMB_RECV_HEADER:
+		result = smb_receive_header(server);
+		if (result < 0)
+			break;
+		if (server->rstate == SMB_RECV_HEADER)
+			break;
+		if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
+			server->rstate = SMB_RECV_REQUEST;
+			break;
+		}
+		if (server->rstate != SMB_RECV_HCOMPLETE)
+			break;
+		/* fallthrough */
+	case SMB_RECV_HCOMPLETE:
+		req = find_request(server, WVAL(server->header, smb_mid));
+		if (!req)
+			break;
+		smb_init_request(server, req);
+		req->rq_rcls = *(req->rq_header + smb_rcls);
+		req->rq_err  = WVAL(req->rq_header, smb_err);
+		if (server->rstate != SMB_RECV_PARAM)
+			break;
+		/* fallthrough */
+	case SMB_RECV_PARAM:
+		if (!req)
+			req = find_request(server,WVAL(server->header,smb_mid));
+		if (!req)
+			break;
+		result = smb_recv_param(server, req);
+		if (result < 0)
+			break;
+		if (server->rstate != SMB_RECV_DATA)
+			break;
+		/* fallthrough */
+	case SMB_RECV_DATA:
+		if (!req)
+			req = find_request(server,WVAL(server->header,smb_mid));
+		if (!req)
+			break;
+		result = smb_recv_data(server, req);
+		if (result < 0)
+			break;
+		break;
+
+		/* We should never be called with any of these states */
+	case SMB_RECV_END:
+	case SMB_RECV_REQUEST:
+		BUG();
+	}
+
+	if (result < 0) {
+		/* We saw an error */
+		return result;
+	}
+
+	if (server->rstate != SMB_RECV_END)
+		return 0;
+
+	result = 0;
+	if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
+		result = smb_recv_trans2(server, req);
+
+	/*
+	 * Response completely read. Drop any extra bytes sent by the server.
+	 * (Yes, servers sometimes add extra bytes to responses)
+	 */
+	VERBOSE("smb_len: %d   smb_read: %d\n",
+		server->smb_len, server->smb_read);
+	if (server->smb_read < server->smb_len)
+		smb_receive_drop(server);
+
+	server->rstate = SMB_RECV_START;
+
+	if (!result) {
+		list_del_init(&req->rq_queue);
+		req->rq_flags |= SMB_REQ_RECEIVED;
+		smb_rput(req);
+		wake_up_interruptible(&req->rq_wait);
+	}
+	return 0;
+}
diff --git a/drivers/staging/smbfs/request.h b/drivers/staging/smbfs/request.h
new file mode 100644
index 000000000000..efb21451e7c9
--- /dev/null
+++ b/drivers/staging/smbfs/request.h
@@ -0,0 +1,70 @@
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/wait.h>
+
+struct smb_request {
+	struct list_head rq_queue;	/* recvq or xmitq for the server */
+
+	atomic_t rq_count;
+
+	wait_queue_head_t rq_wait;
+	int rq_flags;
+	int rq_mid;	/* multiplex ID, set by request.c */
+
+	struct smb_sb_info *rq_server;
+
+	/* header + word count + parameter words + byte count */
+	unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
+
+	int rq_bufsize;
+	unsigned char *rq_buffer;
+
+	/* FIXME: this is not good enough for merging IO requests. */
+	unsigned char *rq_page;
+	int rq_rsize;
+
+	int rq_resp_wct;
+	int rq_resp_bcc;
+
+	int rq_rlen;
+	int rq_bytes_recvd;
+
+	int rq_slen;
+	int rq_bytes_sent;
+
+	int rq_iovlen;
+	struct kvec rq_iov[4];
+
+	int (*rq_setup_read) (struct smb_request *);
+	void (*rq_callback) (struct smb_request *);
+
+	/* ------ trans2 stuff ------ */
+
+	u16 rq_trans2_command;	/* 0 if not a trans2 request */
+	unsigned int rq_ldata;
+	unsigned char *rq_data;
+	unsigned int rq_lparm;
+	unsigned char *rq_parm;
+
+	int rq_fragment;
+	u32 rq_total_data;
+	u32 rq_total_parm;
+	int rq_trans2bufsize;
+	unsigned char *rq_trans2buffer;
+
+	/* ------ response ------ */
+
+	unsigned short rq_rcls;
+	unsigned short rq_err;
+	int rq_errno;
+};
+
+#define SMB_REQ_STATIC		0x0001	/* rq_buffer is static */
+#define SMB_REQ_NORETRY		0x0002	/* request is invalid after retry */
+
+#define SMB_REQ_TRANSMITTED	0x4000	/* all data has been sent */
+#define SMB_REQ_RECEIVED	0x8000	/* reply received, smbiod is done */
+
+#define xSMB_REQ_NOREPLY	0x0004	/* we don't want the reply (if any) */
+#define xSMB_REQ_NORECEIVER	0x0008	/* caller doesn't wait for response */
diff --git a/drivers/staging/smbfs/smb.h b/drivers/staging/smbfs/smb.h
new file mode 100644
index 000000000000..82fefddc5987
--- /dev/null
+++ b/drivers/staging/smbfs/smb.h
@@ -0,0 +1,118 @@
+/*
+ *  smb.h
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_SMB_H
+#define _LINUX_SMB_H
+
+#include <linux/types.h>
+#include <linux/magic.h>
+#ifdef __KERNEL__
+#include <linux/time.h>
+#endif
+
+enum smb_protocol { 
+	SMB_PROTOCOL_NONE, 
+	SMB_PROTOCOL_CORE, 
+	SMB_PROTOCOL_COREPLUS, 
+	SMB_PROTOCOL_LANMAN1, 
+	SMB_PROTOCOL_LANMAN2, 
+	SMB_PROTOCOL_NT1 
+};
+
+enum smb_case_hndl {
+	SMB_CASE_DEFAULT,
+	SMB_CASE_LOWER,
+	SMB_CASE_UPPER
+};
+
+struct smb_dskattr {
+        __u16 total;
+        __u16 allocblocks;
+        __u16 blocksize;
+        __u16 free;
+};
+
+struct smb_conn_opt {
+
+        /* The socket */
+	unsigned int fd;
+
+	enum smb_protocol protocol;
+	enum smb_case_hndl case_handling;
+
+	/* Connection-Options */
+
+	__u32              max_xmit;
+	__u16              server_uid;
+	__u16              tid;
+
+        /* The following are LANMAN 1.0 options */
+        __u16              secmode;
+        __u16              maxmux;
+        __u16              maxvcs;
+        __u16              rawmode;
+        __u32              sesskey;
+
+	/* The following are NT LM 0.12 options */
+	__u32              maxraw;
+	__u32              capabilities;
+	__s16              serverzone;
+};
+
+#ifdef __KERNEL__
+
+#define SMB_NLS_MAXNAMELEN 20
+struct smb_nls_codepage {
+	char local_name[SMB_NLS_MAXNAMELEN];
+	char remote_name[SMB_NLS_MAXNAMELEN];
+};
+
+
+#define SMB_MAXNAMELEN 255
+#define SMB_MAXPATHLEN 1024
+
+/*
+ * Contains all relevant data on a SMB networked file.
+ */
+struct smb_fattr {
+	__u16 attr;
+
+	unsigned long	f_ino;
+	umode_t		f_mode;
+	nlink_t		f_nlink;
+	uid_t		f_uid;
+	gid_t		f_gid;
+	dev_t		f_rdev;
+	loff_t		f_size;
+	struct timespec	f_atime;
+	struct timespec f_mtime;
+	struct timespec f_ctime;
+	unsigned long	f_blocks;
+	int		f_unix;
+};
+
+enum smb_conn_state {
+	CONN_VALID,		/* everything's fine */
+	CONN_INVALID,		/* Something went wrong, but did not
+				   try to reconnect yet. */
+	CONN_RETRIED,		/* Tried a reconnection, but was refused */
+	CONN_RETRYING		/* Currently trying to reconnect */
+};
+
+#define SMB_HEADER_LEN   37     /* includes everything up to, but not
+                                 * including smb_bcc */
+
+#define SMB_INITIAL_PACKET_SIZE		4000
+#define SMB_MAX_PACKET_SIZE		32768
+
+/* reserve this much space for trans2 parameters. Shouldn't have to be more
+   than 10 or so, but OS/2 seems happier like this. */
+#define SMB_TRANS2_MAX_PARAM 64
+
+#endif
+#endif
diff --git a/drivers/staging/smbfs/smb_debug.h b/drivers/staging/smbfs/smb_debug.h
new file mode 100644
index 000000000000..fc4b1a5dd755
--- /dev/null
+++ b/drivers/staging/smbfs/smb_debug.h
@@ -0,0 +1,34 @@
+/*
+ * Defines some debug macros for smbfs.
+ */
+
+/* This makes a dentry parent/child name pair. Useful for debugging printk's */
+#define DENTRY_PATH(dentry) \
+	(dentry)->d_parent->d_name.name,(dentry)->d_name.name
+
+/*
+ * safety checks that should never happen ???
+ * these are normally enabled.
+ */
+#ifdef SMBFS_PARANOIA
+# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
+#else
+# define PARANOIA(f, a...) do { ; } while(0)
+#endif
+
+/* lots of debug messages */
+#ifdef SMBFS_DEBUG_VERBOSE
+# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
+#else
+# define VERBOSE(f, a...) do { ; } while(0)
+#endif
+
+/*
+ * "normal" debug messages, but not with a normal DEBUG define ... way
+ * too common name.
+ */
+#ifdef SMBFS_DEBUG
+#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
+#else
+#define DEBUG1(f, a...) do { ; } while(0)
+#endif
diff --git a/drivers/staging/smbfs/smb_fs.h b/drivers/staging/smbfs/smb_fs.h
new file mode 100644
index 000000000000..20a05c188eb9
--- /dev/null
+++ b/drivers/staging/smbfs/smb_fs.h
@@ -0,0 +1,153 @@
+/*
+ *  smb_fs.h
+ *
+ *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_SMB_FS_H
+#define _LINUX_SMB_FS_H
+
+#include "smb.h"
+
+/*
+ * ioctl commands
+ */
+#define	SMB_IOC_GETMOUNTUID		_IOR('u', 1, __kernel_old_uid_t)
+#define SMB_IOC_NEWCONN                 _IOW('u', 2, struct smb_conn_opt)
+
+/* __kernel_uid_t can never change, so we have to use __kernel_uid32_t */
+#define	SMB_IOC_GETMOUNTUID32		_IOR('u', 3, __kernel_uid32_t)
+
+
+#ifdef __KERNEL__
+#include "smb_fs_i.h"
+#include "smb_fs_sb.h"
+#include "smb_mount.h"
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/jiffies.h>
+#include <asm/unaligned.h>
+
+static inline struct smb_sb_info *SMB_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct smb_inode_info *SMB_I(struct inode *inode)
+{
+	return container_of(inode, struct smb_inode_info, vfs_inode);
+}
+
+/* macro names are short for word, double-word, long value (?) */
+#define WVAL(buf, pos) (get_unaligned_le16((u8 *)(buf) + (pos)))
+#define DVAL(buf, pos) (get_unaligned_le32((u8 *)(buf) + (pos)))
+#define LVAL(buf, pos) (get_unaligned_le64((u8 *)(buf) + (pos)))
+
+#define WSET(buf, pos, val) put_unaligned_le16((val), (u8 *)(buf) + (pos))
+#define DSET(buf, pos, val) put_unaligned_le32((val), (u8 *)(buf) + (pos))
+#define LSET(buf, pos, val) put_unaligned_le64((val), (u8 *)(buf) + (pos))
+
+/* where to find the base of the SMB packet proper */
+#define smb_base(buf) ((u8 *)(((u8 *)(buf))+4))
+
+/*
+ * Flags for the in-memory inode
+ */
+#define SMB_F_LOCALWRITE	0x02	/* file modified locally */
+
+
+/* NT1 protocol capability bits */
+#define SMB_CAP_RAW_MODE         0x00000001
+#define SMB_CAP_MPX_MODE         0x00000002
+#define SMB_CAP_UNICODE          0x00000004
+#define SMB_CAP_LARGE_FILES      0x00000008
+#define SMB_CAP_NT_SMBS          0x00000010
+#define SMB_CAP_RPC_REMOTE_APIS  0x00000020
+#define SMB_CAP_STATUS32         0x00000040
+#define SMB_CAP_LEVEL_II_OPLOCKS 0x00000080
+#define SMB_CAP_LOCK_AND_READ    0x00000100
+#define SMB_CAP_NT_FIND          0x00000200
+#define SMB_CAP_DFS              0x00001000
+#define SMB_CAP_LARGE_READX      0x00004000
+#define SMB_CAP_LARGE_WRITEX     0x00008000
+#define SMB_CAP_UNIX             0x00800000	/* unofficial ... */
+
+
+/*
+ * This is the time we allow an inode, dentry or dir cache to live. It is bad
+ * for performance to have shorter ttl on an inode than on the cache. It can
+ * cause refresh on each inode for a dir listing ... one-by-one
+ */
+#define SMB_MAX_AGE(server) (((server)->mnt->ttl * HZ) / 1000)
+
+static inline void
+smb_age_dentry(struct smb_sb_info *server, struct dentry *dentry)
+{
+	dentry->d_time = jiffies - SMB_MAX_AGE(server);
+}
+
+struct smb_cache_head {
+	time_t		mtime;	/* unused */
+	unsigned long	time;	/* cache age */
+	unsigned long	end;	/* last valid fpos in cache */
+	int		eof;
+};
+
+#define SMB_DIRCACHE_SIZE	((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
+union smb_dir_cache {
+	struct smb_cache_head   head;
+	struct dentry           *dentry[SMB_DIRCACHE_SIZE];
+};
+
+#define SMB_FIRSTCACHE_SIZE	((int)((SMB_DIRCACHE_SIZE * \
+	sizeof(struct dentry *) - sizeof(struct smb_cache_head)) / \
+	sizeof(struct dentry *)))
+
+#define SMB_DIRCACHE_START      (SMB_DIRCACHE_SIZE - SMB_FIRSTCACHE_SIZE)
+
+struct smb_cache_control {
+	struct  smb_cache_head		head;
+	struct  page			*page;
+	union   smb_dir_cache		*cache;
+	unsigned long			fpos, ofs;
+	int				filled, valid, idx;
+};
+
+#define SMB_OPS_NUM_STATIC	5
+struct smb_ops {
+	int (*read)(struct inode *inode, loff_t offset, int count,
+		    char *data);
+	int (*write)(struct inode *inode, loff_t offset, int count, const
+		     char *data);
+	int (*readdir)(struct file *filp, void *dirent, filldir_t filldir,
+		       struct smb_cache_control *ctl);
+
+	int (*getattr)(struct smb_sb_info *server, struct dentry *dir,
+		       struct smb_fattr *fattr);
+	/* int (*setattr)(...); */      /* setattr is really icky! */
+
+	int (*truncate)(struct inode *inode, loff_t length);
+
+
+	/* --- --- --- end of "static" entries --- --- --- */
+
+	int (*convert)(unsigned char *output, int olen,
+		       const unsigned char *input, int ilen,
+		       struct nls_table *nls_from,
+		       struct nls_table *nls_to);
+};
+
+static inline int
+smb_is_open(struct inode *i)
+{
+	return (SMB_I(i)->open == server_from_inode(i)->generation);
+}
+
+extern void smb_install_null_ops(struct smb_ops *);
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SMB_FS_H */
diff --git a/drivers/staging/smbfs/smb_fs_i.h b/drivers/staging/smbfs/smb_fs_i.h
new file mode 100644
index 000000000000..8ccf4eca2c3d
--- /dev/null
+++ b/drivers/staging/smbfs/smb_fs_i.h
@@ -0,0 +1,37 @@
+/*
+ *  smb_fs_i.h
+ *
+ *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_SMB_FS_I
+#define _LINUX_SMB_FS_I
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+/*
+ * smb fs inode data (in memory only)
+ */
+struct smb_inode_info {
+
+	/*
+	 * file handles are local to a connection. A file is open if
+	 * (open == generation).
+	 */
+        unsigned int open;	/* open generation */
+	__u16 fileid;		/* What id to handle a file with? */
+	__u16 attr;		/* Attribute fields, DOS value */
+
+	__u16 access;		/* Access mode */
+	__u16 flags;
+	unsigned long oldmtime;	/* last time refreshed */
+	unsigned long closed;	/* timestamp when closed */
+	unsigned openers;	/* number of fileid users */
+
+	struct inode vfs_inode;	/* must be at the end */
+};
+
+#endif
diff --git a/drivers/staging/smbfs/smb_fs_sb.h b/drivers/staging/smbfs/smb_fs_sb.h
new file mode 100644
index 000000000000..ca058afda900
--- /dev/null
+++ b/drivers/staging/smbfs/smb_fs_sb.h
@@ -0,0 +1,100 @@
+/*
+ *  smb_fs_sb.h
+ *
+ *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ */
+
+#ifndef _SMB_FS_SB
+#define _SMB_FS_SB
+
+#include <linux/types.h>
+#include <linux/backing-dev.h>
+#include "smb.h"
+
+/*
+ * Upper limit on the total number of active smb_request structs.
+ */
+#define MAX_REQUEST_HARD       256
+
+enum smb_receive_state {
+	SMB_RECV_START,		/* No data read, looking for length + sig */
+	SMB_RECV_HEADER,	/* Reading the header data */
+	SMB_RECV_HCOMPLETE,	/* Done with the header */
+	SMB_RECV_PARAM,		/* Reading parameter words */
+	SMB_RECV_DATA,		/* Reading data bytes */
+	SMB_RECV_END,		/* End of request */
+	SMB_RECV_DROP,		/* Dropping this SMB */
+	SMB_RECV_REQUEST,	/* Received a request and not a reply */
+};
+
+/* structure access macros */
+#define server_from_inode(inode) SMB_SB((inode)->i_sb)
+#define server_from_dentry(dentry) SMB_SB((dentry)->d_sb)
+#define SB_of(server) ((server)->super_block)
+
+struct smb_sb_info {
+	/* List of all smbfs superblocks */
+	struct list_head entry;
+
+        enum smb_conn_state state;
+	struct file * sock_file;
+	int conn_error;
+	enum smb_receive_state rstate;
+
+	atomic_t nr_requests;
+	struct list_head xmitq;
+	struct list_head recvq;
+	u16 mid;
+
+        struct smb_mount_data_kernel *mnt;
+
+	/* Connections are counted. Each time a new socket arrives,
+	 * generation is incremented.
+	 */
+	unsigned int generation;
+	struct pid *conn_pid;
+	struct smb_conn_opt opt;
+	wait_queue_head_t conn_wq;
+	int conn_complete;
+	struct semaphore sem;
+
+	unsigned char      header[SMB_HEADER_LEN + 20*2 + 2];
+	u32                header_len;
+	u32                smb_len;
+	u32                smb_read;
+
+        /* We use our own data_ready callback, but need the original one */
+        void *data_ready;
+
+	/* nls pointers for codepage conversions */
+	struct nls_table *remote_nls;
+	struct nls_table *local_nls;
+
+	struct smb_ops *ops;
+
+	struct super_block *super_block;
+
+	struct backing_dev_info bdi;
+};
+
+static inline int
+smb_lock_server_interruptible(struct smb_sb_info *server)
+{
+	return down_interruptible(&(server->sem));
+}
+
+static inline void
+smb_lock_server(struct smb_sb_info *server)
+{
+	down(&(server->sem));
+}
+
+static inline void
+smb_unlock_server(struct smb_sb_info *server)
+{
+	up(&(server->sem));
+}
+
+#endif
diff --git a/drivers/staging/smbfs/smb_mount.h b/drivers/staging/smbfs/smb_mount.h
new file mode 100644
index 000000000000..d10f00cb5703
--- /dev/null
+++ b/drivers/staging/smbfs/smb_mount.h
@@ -0,0 +1,65 @@
+/*
+ *  smb_mount.h
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_SMB_MOUNT_H
+#define _LINUX_SMB_MOUNT_H
+
+#include <linux/types.h>
+
+#define SMB_MOUNT_VERSION	6
+
+struct smb_mount_data {
+	int version;
+	__kernel_uid_t mounted_uid; /* Who may umount() this filesystem? */
+	__kernel_uid_t uid;
+	__kernel_gid_t gid;
+	__kernel_mode_t file_mode;
+	__kernel_mode_t dir_mode;
+};
+
+
+#ifdef __KERNEL__
+
+/* "vers" in big-endian */
+#define SMB_MOUNT_ASCII 0x76657273
+
+#define SMB_MOUNT_OLDVERSION	6
+#undef SMB_MOUNT_VERSION
+#define SMB_MOUNT_VERSION	7
+
+/* flags */
+#define SMB_MOUNT_WIN95		0x0001	/* Win 95 server */
+#define SMB_MOUNT_OLDATTR	0x0002	/* Use core getattr (Win 95 speedup) */
+#define SMB_MOUNT_DIRATTR	0x0004	/* Use find_first for getattr */
+#define SMB_MOUNT_CASE		0x0008	/* Be case sensitive */
+#define SMB_MOUNT_UNICODE	0x0010	/* Server talks unicode */
+#define SMB_MOUNT_UID		0x0020  /* Use user specified uid */
+#define SMB_MOUNT_GID		0x0040  /* Use user specified gid */
+#define SMB_MOUNT_FMODE		0x0080  /* Use user specified file mode */
+#define SMB_MOUNT_DMODE		0x0100  /* Use user specified dir mode */
+
+struct smb_mount_data_kernel {
+	int version;
+
+	uid_t mounted_uid;	/* Who may umount() this filesystem? */
+	uid_t uid;
+	gid_t gid;
+	mode_t file_mode;
+	mode_t dir_mode;
+
+	u32 flags;
+
+        /* maximum age in jiffies (inode, dentry and dircache) */
+	int ttl;
+
+	struct smb_nls_codepage codepage;
+};
+
+#endif
+
+#endif
diff --git a/drivers/staging/smbfs/smbfs.txt b/drivers/staging/smbfs/smbfs.txt
new file mode 100644
index 000000000000..194fb0decd2c
--- /dev/null
+++ b/drivers/staging/smbfs/smbfs.txt
@@ -0,0 +1,8 @@
+Smbfs is a filesystem that implements the SMB protocol, which is the
+protocol used by Windows for Workgroups, Windows 95 and Windows NT.
+Smbfs was inspired by Samba, the program written by Andrew Tridgell
+that turns any Unix host into a file server for DOS or Windows clients.
+
+Smbfs is a SMB client, but uses parts of samba for its operation. For
+more info on samba, including documentation, please go to
+http://www.samba.org/ and then on to your nearest mirror.
diff --git a/drivers/staging/smbfs/smbiod.c b/drivers/staging/smbfs/smbiod.c
new file mode 100644
index 000000000000..ec998920f8d9
--- /dev/null
+++ b/drivers/staging/smbfs/smbiod.c
@@ -0,0 +1,343 @@
+/*
+ *  smbiod.c
+ *
+ *  Copyright (C) 2000, Charles Loep / Corel Corp.
+ *  Copyright (C) 2001, Urban Widmark
+ */
+
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <net/ip.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "smb_fs.h"
+#include "smbno.h"
+#include "smb_mount.h"
+#include "smb_debug.h"
+#include "request.h"
+#include "proto.h"
+
+enum smbiod_state {
+	SMBIOD_DEAD,
+	SMBIOD_STARTING,
+	SMBIOD_RUNNING,
+};
+
+static enum smbiod_state smbiod_state = SMBIOD_DEAD;
+static struct task_struct *smbiod_thread;
+static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
+static LIST_HEAD(smb_servers);
+static DEFINE_SPINLOCK(servers_lock);
+
+#define SMBIOD_DATA_READY	(1<<0)
+static unsigned long smbiod_flags;
+
+static int smbiod(void *);
+static int smbiod_start(void);
+
+/*
+ * called when there's work for us to do
+ */
+void smbiod_wake_up(void)
+{
+	if (smbiod_state == SMBIOD_DEAD)
+		return;
+	set_bit(SMBIOD_DATA_READY, &smbiod_flags);
+	wake_up_interruptible(&smbiod_wait);
+}
+
+/*
+ * start smbiod if none is running
+ */
+static int smbiod_start(void)
+{
+	struct task_struct *tsk;
+	int err = 0;
+
+	if (smbiod_state != SMBIOD_DEAD)
+		return 0;
+	smbiod_state = SMBIOD_STARTING;
+	__module_get(THIS_MODULE);
+	spin_unlock(&servers_lock);
+	tsk = kthread_run(smbiod, NULL, "smbiod");
+	if (IS_ERR(tsk)) {
+		err = PTR_ERR(tsk);
+		module_put(THIS_MODULE);
+	}
+
+	spin_lock(&servers_lock);
+	if (err < 0) {
+		smbiod_state = SMBIOD_DEAD;
+		smbiod_thread = NULL;
+	} else {
+		smbiod_state = SMBIOD_RUNNING;
+		smbiod_thread = tsk;
+	}
+	return err;
+}
+
+/*
+ * register a server & start smbiod if necessary
+ */
+int smbiod_register_server(struct smb_sb_info *server)
+{
+	int ret;
+	spin_lock(&servers_lock);
+	list_add(&server->entry, &smb_servers);
+	VERBOSE("%p\n", server);
+	ret = smbiod_start();
+	spin_unlock(&servers_lock);
+	return ret;
+}
+
+/*
+ * Unregister a server
+ * Must be called with the server lock held.
+ */
+void smbiod_unregister_server(struct smb_sb_info *server)
+{
+	spin_lock(&servers_lock);
+	list_del_init(&server->entry);
+	VERBOSE("%p\n", server);
+	spin_unlock(&servers_lock);
+
+	smbiod_wake_up();
+	smbiod_flush(server);
+}
+
+void smbiod_flush(struct smb_sb_info *server)
+{
+	struct list_head *tmp, *n;
+	struct smb_request *req;
+
+	list_for_each_safe(tmp, n, &server->xmitq) {
+		req = list_entry(tmp, struct smb_request, rq_queue);
+		req->rq_errno = -EIO;
+		list_del_init(&req->rq_queue);
+		smb_rput(req);
+		wake_up_interruptible(&req->rq_wait);
+	}
+	list_for_each_safe(tmp, n, &server->recvq) {
+		req = list_entry(tmp, struct smb_request, rq_queue);
+		req->rq_errno = -EIO;
+		list_del_init(&req->rq_queue);
+		smb_rput(req);
+		wake_up_interruptible(&req->rq_wait);
+	}
+}
+
+/*
+ * Wake up smbmount and make it reconnect to the server.
+ * This must be called with the server locked.
+ *
+ * FIXME: add smbconnect version to this
+ */
+int smbiod_retry(struct smb_sb_info *server)
+{
+	struct list_head *head;
+	struct smb_request *req;
+	struct pid *pid = get_pid(server->conn_pid);
+	int result = 0;
+
+	VERBOSE("state: %d\n", server->state);
+	if (server->state == CONN_VALID || server->state == CONN_RETRYING)
+		goto out;
+
+	smb_invalidate_inodes(server);
+
+	/*
+	 * Some requests are meaningless after a retry, so we abort them.
+	 * One example are all requests using 'fileid' since the files are
+	 * closed on retry.
+	 */
+	head = server->xmitq.next;
+	while (head != &server->xmitq) {
+		req = list_entry(head, struct smb_request, rq_queue);
+		head = head->next;
+
+		req->rq_bytes_sent = 0;
+		if (req->rq_flags & SMB_REQ_NORETRY) {
+			VERBOSE("aborting request %p on xmitq\n", req);
+			req->rq_errno = -EIO;
+			list_del_init(&req->rq_queue);
+			smb_rput(req);
+			wake_up_interruptible(&req->rq_wait);
+		}
+	}
+
+	/*
+	 * FIXME: test the code for retrying request we already sent
+	 */
+	head = server->recvq.next;
+	while (head != &server->recvq) {
+		req = list_entry(head, struct smb_request, rq_queue);
+		head = head->next;
+#if 0
+		if (req->rq_flags & SMB_REQ_RETRY) {
+			/* must move the request to the xmitq */
+			VERBOSE("retrying request %p on recvq\n", req);
+			list_move(&req->rq_queue, &server->xmitq);
+			continue;
+		}
+#endif
+
+		VERBOSE("aborting request %p on recvq\n", req);
+		/* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
+		req->rq_errno = -EIO;
+		list_del_init(&req->rq_queue);
+		smb_rput(req);
+		wake_up_interruptible(&req->rq_wait);
+	}
+
+	smb_close_socket(server);
+
+	if (!pid) {
+		/* FIXME: this is fatal, umount? */
+		printk(KERN_ERR "smb_retry: no connection process\n");
+		server->state = CONN_RETRIED;
+		goto out;
+	}
+
+	/*
+	 * Change state so that only one retry per server will be started.
+	 */
+	server->state = CONN_RETRYING;
+
+	/*
+	 * Note: use the "priv" flag, as a user process may need to reconnect.
+	 */
+	result = kill_pid(pid, SIGUSR1, 1);
+	if (result) {
+		/* FIXME: this is most likely fatal, umount? */
+		printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
+		goto out;
+	}
+	VERBOSE("signalled pid %d\n", pid_nr(pid));
+
+	/* FIXME: The retried requests should perhaps get a "time boost". */
+
+out:
+	put_pid(pid);
+	return result;
+}
+
+/*
+ * Currently handles lockingX packets.
+ */
+static void smbiod_handle_request(struct smb_sb_info *server)
+{
+	PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
+	server->rstate = SMB_RECV_DROP;
+}
+
+/*
+ * Do some IO for one server.
+ */
+static void smbiod_doio(struct smb_sb_info *server)
+{
+	int result;
+	int maxwork = 7;
+
+	if (server->state != CONN_VALID)
+		goto out;
+
+	do {
+		result = smb_request_recv(server);
+		if (result < 0) {
+			server->state = CONN_INVALID;
+			smbiod_retry(server);
+			goto out;	/* reconnecting is slow */
+		} else if (server->rstate == SMB_RECV_REQUEST)
+			smbiod_handle_request(server);
+	} while (result > 0 && maxwork-- > 0);
+
+	/*
+	 * If there is more to read then we want to be sure to wake up again.
+	 */
+	if (server->state != CONN_VALID)
+		goto out;
+	if (smb_recv_available(server) > 0)
+		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
+
+	do {
+		result = smb_request_send_server(server);
+		if (result < 0) {
+			server->state = CONN_INVALID;
+			smbiod_retry(server);
+			goto out;	/* reconnecting is slow */
+		}
+	} while (result > 0);
+
+	/*
+	 * If the last request was not sent out we want to wake up again.
+	 */
+	if (!list_empty(&server->xmitq))
+		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
+
+out:
+	return;
+}
+
+/*
+ * smbiod kernel thread
+ */
+static int smbiod(void *unused)
+{
+	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
+
+	for (;;) {
+		struct smb_sb_info *server;
+		struct list_head *pos, *n;
+
+		/* FIXME: Use poll? */
+		wait_event_interruptible(smbiod_wait,
+			 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
+		if (signal_pending(current)) {
+			spin_lock(&servers_lock);
+			smbiod_state = SMBIOD_DEAD;
+			spin_unlock(&servers_lock);
+			break;
+		}
+
+		clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
+
+		spin_lock(&servers_lock);
+		if (list_empty(&smb_servers)) {
+			smbiod_state = SMBIOD_DEAD;
+			spin_unlock(&servers_lock);
+			break;
+		}
+
+		list_for_each_safe(pos, n, &smb_servers) {
+			server = list_entry(pos, struct smb_sb_info, entry);
+			VERBOSE("checking server %p\n", server);
+
+			if (server->state == CONN_VALID) {
+				spin_unlock(&servers_lock);
+
+				smb_lock_server(server);
+				smbiod_doio(server);
+				smb_unlock_server(server);
+
+				spin_lock(&servers_lock);
+			}
+		}
+		spin_unlock(&servers_lock);
+	}
+
+	VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
+	module_put_and_exit(0);
+}
diff --git a/drivers/staging/smbfs/smbno.h b/drivers/staging/smbfs/smbno.h
new file mode 100644
index 000000000000..f99e02d9ffe2
--- /dev/null
+++ b/drivers/staging/smbfs/smbno.h
@@ -0,0 +1,363 @@
+#ifndef _SMBNO_H_
+#define _SMBNO_H_
+
+/* these define the attribute byte as seen by DOS */
+#define aRONLY	(1L<<0)
+#define aHIDDEN	(1L<<1)
+#define aSYSTEM	(1L<<2)
+#define aVOLID	(1L<<3)
+#define aDIR	(1L<<4)
+#define aARCH	(1L<<5)
+
+/* error classes */
+#define SUCCESS 0  /* The request was successful. */
+#define ERRDOS 0x01 /*  Error is from the core DOS operating system set. */
+#define ERRSRV 0x02 /* Error is generated by the server network file manager.*/
+#define ERRHRD 0x03  /* Error is an hardware error. */
+#define ERRCMD 0xFF  /* Command was not in the "SMB" format. */
+
+/* SMB X/Open error codes for the ERRdos error class */
+
+#define ERRbadfunc 1            /* Invalid function (or system call) */
+#define ERRbadfile 2            /* File not found (pathname error) */
+#define ERRbadpath 3            /* Directory not found */
+#define ERRnofids 4             /* Too many open files */
+#define ERRnoaccess 5           /* Access denied */
+#define ERRbadfid 6             /* Invalid fid */
+#define ERRbadmcb 7             /* Memory control blocks destroyed */
+#define ERRnomem 8              /* Out of memory */
+#define ERRbadmem 9             /* Invalid memory block address */
+#define ERRbadenv 10            /* Invalid environment */
+#define ERRbadformat 11         /* Invalid format */
+#define ERRbadaccess 12         /* Invalid open mode */
+#define ERRbaddata 13           /* Invalid data (only from ioctl call) */
+#define ERRres 14               /* reserved */
+#define ERRbaddrive 15          /* Invalid drive */
+#define ERRremcd 16             /* Attempt to delete current directory */
+#define ERRdiffdevice 17        /* rename/move across different filesystems */
+#define ERRnofiles 18           /* no more files found in file search */
+#define ERRbadshare 32          /* Share mode on file conflict with open mode */
+#define ERRlock 33              /* Lock request conflicts with existing lock */
+#define ERRfilexists 80         /* File in operation already exists */
+#define ERRbadpipe 230          /* Named pipe invalid */
+#define ERRpipebusy 231         /* All instances of pipe are busy */
+#define ERRpipeclosing 232      /* named pipe close in progress */
+#define ERRnotconnected 233     /* No process on other end of named pipe */
+#define ERRmoredata 234         /* More data to be returned */
+
+#define ERROR_INVALID_PARAMETER	 87
+#define ERROR_DISK_FULL		112
+#define ERROR_INVALID_NAME	123
+#define ERROR_DIR_NOT_EMPTY	145
+#define ERROR_NOT_LOCKED	158
+#define ERROR_ALREADY_EXISTS	183  /* see also 80 ? */
+#define ERROR_EAS_DIDNT_FIT	275 /* Extended attributes didn't fit */
+#define ERROR_EAS_NOT_SUPPORTED	282 /* Extended attributes not supported */
+
+/* Error codes for the ERRSRV class */
+
+#define ERRerror 1              /* Non specific error code */
+#define ERRbadpw 2              /* Bad password */
+#define ERRbadtype 3            /* reserved */
+#define ERRaccess 4          /* No permissions to do the requested operation */
+#define ERRinvnid 5             /* tid invalid */
+#define ERRinvnetname 6         /* Invalid servername */
+#define ERRinvdevice 7          /* Invalid device */
+#define ERRqfull 49             /* Print queue full */
+#define ERRqtoobig 50           /* Queued item too big */
+#define ERRinvpfid 52           /* Invalid print file in smb_fid */
+#define ERRsmbcmd 64            /* Unrecognised command */
+#define ERRsrverror 65          /* smb server internal error */
+#define ERRfilespecs 67         /* fid and pathname invalid combination */
+#define ERRbadlink 68           /* reserved */
+#define ERRbadpermits 69        /* Access specified for a file is not valid */
+#define ERRbadpid 70            /* reserved */
+#define ERRsetattrmode 71       /* attribute mode invalid */
+#define ERRpaused 81            /* Message server paused */
+#define ERRmsgoff 82            /* Not receiving messages */
+#define ERRnoroom 83            /* No room for message */
+#define ERRrmuns 87             /* too many remote usernames */
+#define ERRtimeout 88           /* operation timed out */
+#define ERRnoresource  89   /* No resources currently available for request. */
+#define ERRtoomanyuids 90       /* too many userids */
+#define ERRbaduid 91            /* bad userid */
+#define ERRuseMPX 250    /* temporarily unable to use raw mode, use MPX mode */
+#define ERRuseSTD 251    /* temporarily unable to use raw mode, use std.mode */
+#define ERRcontMPX 252          /* resume MPX mode */
+#define ERRbadPW                /* reserved */
+#define ERRnosupport 0xFFFF
+
+/* Error codes for the ERRHRD class */
+
+#define ERRnowrite 19           /* read only media */
+#define ERRbadunit 20           /* Unknown device */
+#define ERRnotready 21          /* Drive not ready */
+#define ERRbadcmd 22            /* Unknown command */
+#define ERRdata 23              /* Data (CRC) error */
+#define ERRbadreq 24            /* Bad request structure length */
+#define ERRseek 25
+#define ERRbadmedia 26
+#define ERRbadsector 27
+#define ERRnopaper 28
+#define ERRwrite 29             /* write fault */
+#define ERRread 30              /* read fault */
+#define ERRgeneral 31           /* General hardware failure */
+#define ERRwrongdisk 34
+#define ERRFCBunavail 35
+#define ERRsharebufexc 36       /* share buffer exceeded */
+#define ERRdiskfull 39
+
+/*
+ * Access modes when opening a file
+ */
+#define SMB_ACCMASK	0x0003
+#define SMB_O_RDONLY	0x0000
+#define SMB_O_WRONLY	0x0001
+#define SMB_O_RDWR	0x0002
+
+/* offsets into message for common items */
+#define smb_com 8
+#define smb_rcls 9
+#define smb_reh 10
+#define smb_err 11
+#define smb_flg 13
+#define smb_flg2 14
+#define smb_reb 13
+#define smb_tid 28
+#define smb_pid 30
+#define smb_uid 32
+#define smb_mid 34
+#define smb_wct 36
+#define smb_vwv 37
+#define smb_vwv0 37
+#define smb_vwv1 39
+#define smb_vwv2 41
+#define smb_vwv3 43
+#define smb_vwv4 45
+#define smb_vwv5 47
+#define smb_vwv6 49
+#define smb_vwv7 51
+#define smb_vwv8 53
+#define smb_vwv9 55
+#define smb_vwv10 57
+#define smb_vwv11 59
+#define smb_vwv12 61
+#define smb_vwv13 63
+#define smb_vwv14 65
+
+/* these are the trans2 sub fields for primary requests */
+#define smb_tpscnt smb_vwv0
+#define smb_tdscnt smb_vwv1
+#define smb_mprcnt smb_vwv2
+#define smb_mdrcnt smb_vwv3
+#define smb_msrcnt smb_vwv4
+#define smb_flags smb_vwv5
+#define smb_timeout smb_vwv6
+#define smb_pscnt smb_vwv9
+#define smb_psoff smb_vwv10
+#define smb_dscnt smb_vwv11
+#define smb_dsoff smb_vwv12
+#define smb_suwcnt smb_vwv13
+#define smb_setup smb_vwv14
+#define smb_setup0 smb_setup
+#define smb_setup1 (smb_setup+2)
+#define smb_setup2 (smb_setup+4)
+
+/* these are for the secondary requests */
+#define smb_spscnt smb_vwv2
+#define smb_spsoff smb_vwv3
+#define smb_spsdisp smb_vwv4
+#define smb_sdscnt smb_vwv5
+#define smb_sdsoff smb_vwv6
+#define smb_sdsdisp smb_vwv7
+#define smb_sfid smb_vwv8
+
+/* and these for responses */
+#define smb_tprcnt smb_vwv0
+#define smb_tdrcnt smb_vwv1
+#define smb_prcnt smb_vwv3
+#define smb_proff smb_vwv4
+#define smb_prdisp smb_vwv5
+#define smb_drcnt smb_vwv6
+#define smb_droff smb_vwv7
+#define smb_drdisp smb_vwv8
+
+/* the complete */
+#define SMBmkdir      0x00   /* create directory */
+#define SMBrmdir      0x01   /* delete directory */
+#define SMBopen       0x02   /* open file */
+#define SMBcreate     0x03   /* create file */
+#define SMBclose      0x04   /* close file */
+#define SMBflush      0x05   /* flush file */
+#define SMBunlink     0x06   /* delete file */
+#define SMBmv         0x07   /* rename file */
+#define SMBgetatr     0x08   /* get file attributes */
+#define SMBsetatr     0x09   /* set file attributes */
+#define SMBread       0x0A   /* read from file */
+#define SMBwrite      0x0B   /* write to file */
+#define SMBlock       0x0C   /* lock byte range */
+#define SMBunlock     0x0D   /* unlock byte range */
+#define SMBctemp      0x0E   /* create temporary file */
+#define SMBmknew      0x0F   /* make new file */
+#define SMBchkpth     0x10   /* check directory path */
+#define SMBexit       0x11   /* process exit */
+#define SMBlseek      0x12   /* seek */
+#define SMBtcon       0x70   /* tree connect */
+#define SMBtconX      0x75   /* tree connect and X*/
+#define SMBtdis       0x71   /* tree disconnect */
+#define SMBnegprot    0x72   /* negotiate protocol */
+#define SMBdskattr    0x80   /* get disk attributes */
+#define SMBsearch     0x81   /* search directory */
+#define SMBsplopen    0xC0   /* open print spool file */
+#define SMBsplwr      0xC1   /* write to print spool file */
+#define SMBsplclose   0xC2   /* close print spool file */
+#define SMBsplretq    0xC3   /* return print queue */
+#define SMBsends      0xD0   /* send single block message */
+#define SMBsendb      0xD1   /* send broadcast message */
+#define SMBfwdname    0xD2   /* forward user name */
+#define SMBcancelf    0xD3   /* cancel forward */
+#define SMBgetmac     0xD4   /* get machine name */
+#define SMBsendstrt   0xD5   /* send start of multi-block message */
+#define SMBsendend    0xD6   /* send end of multi-block message */
+#define SMBsendtxt    0xD7   /* send text of multi-block message */
+
+/* Core+ protocol */
+#define SMBlockread	  0x13   /* Lock a range and read */
+#define SMBwriteunlock 0x14 /* Unlock a range then write */
+#define SMBreadbraw   0x1a  /* read a block of data with no smb header */
+#define SMBwritebraw  0x1d  /* write a block of data with no smb header */
+#define SMBwritec     0x20  /* secondary write request */
+#define SMBwriteclose 0x2c  /* write a file then close it */
+
+/* dos extended protocol */
+#define SMBreadBraw      0x1A   /* read block raw */
+#define SMBreadBmpx      0x1B   /* read block multiplexed */
+#define SMBreadBs        0x1C   /* read block (secondary response) */
+#define SMBwriteBraw     0x1D   /* write block raw */
+#define SMBwriteBmpx     0x1E   /* write block multiplexed */
+#define SMBwriteBs       0x1F   /* write block (secondary request) */
+#define SMBwriteC        0x20   /* write complete response */
+#define SMBsetattrE      0x22   /* set file attributes expanded */
+#define SMBgetattrE      0x23   /* get file attributes expanded */
+#define SMBlockingX      0x24   /* lock/unlock byte ranges and X */
+#define SMBtrans         0x25   /* transaction - name, bytes in/out */
+#define SMBtranss        0x26   /* transaction (secondary request/response) */
+#define SMBioctl         0x27   /* IOCTL */
+#define SMBioctls        0x28   /* IOCTL  (secondary request/response) */
+#define SMBcopy          0x29   /* copy */
+#define SMBmove          0x2A   /* move */
+#define SMBecho          0x2B   /* echo */
+#define SMBopenX         0x2D   /* open and X */
+#define SMBreadX         0x2E   /* read and X */
+#define SMBwriteX        0x2F   /* write and X */
+#define SMBsesssetupX    0x73   /* Session Set Up & X (including User Logon) */
+#define SMBtconX         0x75   /* tree connect and X */
+#define SMBffirst        0x82   /* find first */
+#define SMBfunique       0x83   /* find unique */
+#define SMBfclose        0x84   /* find close */
+#define SMBinvalid       0xFE   /* invalid command */
+
+
+/* Extended 2.0 protocol */
+#define SMBtrans2        0x32   /* TRANS2 protocol set */
+#define SMBtranss2       0x33   /* TRANS2 protocol set, secondary command */
+#define SMBfindclose     0x34   /* Terminate a TRANSACT2_FINDFIRST */
+#define SMBfindnclose    0x35   /* Terminate a TRANSACT2_FINDNOTIFYFIRST */
+#define SMBulogoffX      0x74   /* user logoff */
+
+/* these are the TRANS2 sub commands */
+#define TRANSACT2_OPEN          0
+#define TRANSACT2_FINDFIRST     1
+#define TRANSACT2_FINDNEXT      2
+#define TRANSACT2_QFSINFO       3
+#define TRANSACT2_SETFSINFO     4
+#define TRANSACT2_QPATHINFO     5
+#define TRANSACT2_SETPATHINFO   6
+#define TRANSACT2_QFILEINFO     7
+#define TRANSACT2_SETFILEINFO   8
+#define TRANSACT2_FSCTL         9
+#define TRANSACT2_IOCTL           10
+#define TRANSACT2_FINDNOTIFYFIRST 11
+#define TRANSACT2_FINDNOTIFYNEXT  12
+#define TRANSACT2_MKDIR           13
+
+/* Information Levels -  Shared? */
+#define SMB_INFO_STANDARD		1
+#define SMB_INFO_QUERY_EA_SIZE		2
+#define SMB_INFO_QUERY_EAS_FROM_LIST	3
+#define SMB_INFO_QUERY_ALL_EAS		4
+#define SMB_INFO_IS_NAME_VALID		6
+
+/* Information Levels -  TRANSACT2_FINDFIRST */
+#define SMB_FIND_FILE_DIRECTORY_INFO		0x101
+#define SMB_FIND_FILE_FULL_DIRECTORY_INFO	0x102
+#define SMB_FIND_FILE_NAMES_INFO		0x103
+#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO	0x104
+
+/* Information Levels -  TRANSACT2_QPATHINFO */
+#define SMB_QUERY_FILE_BASIC_INFO	0x101
+#define SMB_QUERY_FILE_STANDARD_INFO	0x102
+#define SMB_QUERY_FILE_EA_INFO		0x103
+#define SMB_QUERY_FILE_NAME_INFO	0x104
+#define SMB_QUERY_FILE_ALL_INFO		0x107
+#define SMB_QUERY_FILE_ALT_NAME_INFO	0x108
+#define SMB_QUERY_FILE_STREAM_INFO	0x109
+#define SMB_QUERY_FILE_COMPRESSION_INFO	0x10b
+
+/* Information Levels - TRANSACT2_SETFILEINFO */
+#define SMB_SET_FILE_BASIC_INFO		0x101
+#define SMB_SET_FILE_DISPOSITION_INFO	0x102
+#define SMB_SET_FILE_ALLOCATION_INFO	0x103
+#define SMB_SET_FILE_END_OF_FILE_INFO	0x104
+
+/* smb_flg field flags */
+#define SMB_FLAGS_SUPPORT_LOCKREAD	0x01
+#define SMB_FLAGS_CLIENT_BUF_AVAIL	0x02
+#define SMB_FLAGS_RESERVED		0x04
+#define SMB_FLAGS_CASELESS_PATHNAMES	0x08
+#define SMB_FLAGS_CANONICAL_PATHNAMES	0x10
+#define SMB_FLAGS_REQUEST_OPLOCK	0x20
+#define SMB_FLAGS_REQUEST_BATCH_OPLOCK	0x40
+#define SMB_FLAGS_REPLY			0x80
+
+/* smb_flg2 field flags (samba-2.2.0/source/include/smb.h) */
+#define SMB_FLAGS2_LONG_PATH_COMPONENTS		0x0001
+#define SMB_FLAGS2_EXTENDED_ATTRIBUTES		0x0002
+#define SMB_FLAGS2_DFS_PATHNAMES		0x1000
+#define SMB_FLAGS2_READ_PERMIT_NO_EXECUTE	0x2000
+#define SMB_FLAGS2_32_BIT_ERROR_CODES		0x4000 
+#define SMB_FLAGS2_UNICODE_STRINGS		0x8000
+
+
+/*
+ * UNIX stuff  (from samba trans2.h)
+ */
+#define MIN_UNIX_INFO_LEVEL		0x200
+#define MAX_UNIX_INFO_LEVEL		0x2FF
+#define SMB_FIND_FILE_UNIX		0x202
+#define SMB_QUERY_FILE_UNIX_BASIC	0x200
+#define SMB_QUERY_FILE_UNIX_LINK	0x201
+#define SMB_QUERY_FILE_UNIX_HLINK	0x202
+#define SMB_SET_FILE_UNIX_BASIC		0x200
+#define SMB_SET_FILE_UNIX_LINK		0x201
+#define SMB_SET_FILE_UNIX_HLINK		0x203
+#define SMB_QUERY_CIFS_UNIX_INFO	0x200
+
+/* values which means "don't change it" */
+#define SMB_MODE_NO_CHANGE		0xFFFFFFFF
+#define SMB_UID_NO_CHANGE		0xFFFFFFFF
+#define SMB_GID_NO_CHANGE		0xFFFFFFFF
+#define SMB_TIME_NO_CHANGE		0xFFFFFFFFFFFFFFFFULL
+#define SMB_SIZE_NO_CHANGE		0xFFFFFFFFFFFFFFFFULL
+
+/* UNIX filetype mappings. */
+#define UNIX_TYPE_FILE		0
+#define UNIX_TYPE_DIR		1
+#define UNIX_TYPE_SYMLINK	2
+#define UNIX_TYPE_CHARDEV	3
+#define UNIX_TYPE_BLKDEV	4
+#define UNIX_TYPE_FIFO		5
+#define UNIX_TYPE_SOCKET	6
+#define UNIX_TYPE_UNKNOWN	0xFFFFFFFF
+
+#endif /* _SMBNO_H_ */
diff --git a/drivers/staging/smbfs/sock.c b/drivers/staging/smbfs/sock.c
new file mode 100644
index 000000000000..9e264090e611
--- /dev/null
+++ b/drivers/staging/smbfs/sock.c
@@ -0,0 +1,385 @@
+/*
+ *  sock.c
+ *
+ *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
+ *  Copyright (C) 1997 by Volker Lendecke
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <net/scm.h>
+#include <net/tcp_states.h>
+#include <net/ip.h>
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include "smb_fs.h"
+#include "smb.h"
+#include "smbno.h"
+#include "smb_debug.h"
+#include "proto.h"
+#include "request.h"
+
+
+static int
+_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
+{
+	struct kvec iov = {ubuf, size};
+	struct msghdr msg = {.msg_flags = flags};
+	msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
+	return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
+}
+
+/*
+ * Return the server this socket belongs to
+ */
+static struct smb_sb_info *
+server_from_socket(struct socket *socket)
+{
+	return socket->sk->sk_user_data;
+}
+
+/*
+ * Called when there is data on the socket.
+ */
+void
+smb_data_ready(struct sock *sk, int len)
+{
+	struct smb_sb_info *server = server_from_socket(sk->sk_socket);
+	void (*data_ready)(struct sock *, int) = server->data_ready;
+
+	data_ready(sk, len);
+	VERBOSE("(%p, %d)\n", sk, len);
+	smbiod_wake_up();
+}
+
+int
+smb_valid_socket(struct inode * inode)
+{
+	return (inode && S_ISSOCK(inode->i_mode) && 
+		SOCKET_I(inode)->type == SOCK_STREAM);
+}
+
+static struct socket *
+server_sock(struct smb_sb_info *server)
+{
+	struct file *file;
+
+	if (server && (file = server->sock_file))
+	{
+#ifdef SMBFS_PARANOIA
+		if (!smb_valid_socket(file->f_path.dentry->d_inode))
+			PARANOIA("bad socket!\n");
+#endif
+		return SOCKET_I(file->f_path.dentry->d_inode);
+	}
+	return NULL;
+}
+
+void
+smb_close_socket(struct smb_sb_info *server)
+{
+	struct file * file = server->sock_file;
+
+	if (file) {
+		struct socket *sock = server_sock(server);
+
+		VERBOSE("closing socket %p\n", sock);
+		sock->sk->sk_data_ready = server->data_ready;
+		server->sock_file = NULL;
+		fput(file);
+	}
+}
+
+static int
+smb_get_length(struct socket *socket, unsigned char *header)
+{
+	int result;
+
+	result = _recvfrom(socket, header, 4, MSG_PEEK);
+	if (result == -EAGAIN)
+		return -ENODATA;
+	if (result < 0) {
+		PARANOIA("recv error = %d\n", -result);
+		return result;
+	}
+	if (result < 4)
+		return -ENODATA;
+
+	switch (header[0]) {
+	case 0x00:
+	case 0x82:
+		break;
+
+	case 0x85:
+		DEBUG1("Got SESSION KEEP ALIVE\n");
+		_recvfrom(socket, header, 4, 0);	/* read away */
+		return -ENODATA;
+
+	default:
+		PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
+		return -EIO;
+	}
+
+	/* The length in the RFC NB header is the raw data length */
+	return smb_len(header);
+}
+
+int
+smb_recv_available(struct smb_sb_info *server)
+{
+	mm_segment_t oldfs;
+	int avail, err;
+	struct socket *sock = server_sock(server);
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
+	set_fs(oldfs);
+	return (err >= 0) ? avail : err;
+}
+
+/*
+ * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
+ */
+static int
+smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
+{
+	struct kvec *iv = *data;
+	int i;
+	int len;
+
+	/*
+	 *	Eat any sent kvecs
+	 */
+	while (iv->iov_len <= amount) {
+		amount -= iv->iov_len;
+		iv++;
+		(*num)--;
+	}
+
+	/*
+	 *	And chew down the partial one
+	 */
+	vec[0].iov_len = iv->iov_len-amount;
+	vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
+	iv++;
+
+	len = vec[0].iov_len;
+
+	/*
+	 *	And copy any others
+	 */
+	for (i = 1; i < *num; i++) {
+		vec[i] = *iv++;
+		len += vec[i].iov_len;
+	}
+
+	*data = vec;
+	return len;
+}
+
+/*
+ * smb_receive_header
+ * Only called by the smbiod thread.
+ */
+int
+smb_receive_header(struct smb_sb_info *server)
+{
+	struct socket *sock;
+	int result = 0;
+	unsigned char peek_buf[4];
+
+	result = -EIO; 
+	sock = server_sock(server);
+	if (!sock)
+		goto out;
+	if (sock->sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	if (!server->smb_read) {
+		result = smb_get_length(sock, peek_buf);
+		if (result < 0) {
+			if (result == -ENODATA)
+				result = 0;
+			goto out;
+		}
+		server->smb_len = result + 4;
+
+		if (server->smb_len < SMB_HEADER_LEN) {
+			PARANOIA("short packet: %d\n", result);
+			server->rstate = SMB_RECV_DROP;
+			result = -EIO;
+			goto out;
+		}
+		if (server->smb_len > SMB_MAX_PACKET_SIZE) {
+			PARANOIA("long packet: %d\n", result);
+			server->rstate = SMB_RECV_DROP;
+			result = -EIO;
+			goto out;
+		}
+	}
+
+	result = _recvfrom(sock, server->header + server->smb_read,
+			   SMB_HEADER_LEN - server->smb_read, 0);
+	VERBOSE("_recvfrom: %d\n", result);
+	if (result < 0) {
+		VERBOSE("receive error: %d\n", result);
+		goto out;
+	}
+	server->smb_read += result;
+
+	if (server->smb_read == SMB_HEADER_LEN)
+		server->rstate = SMB_RECV_HCOMPLETE;
+out:
+	return result;
+}
+
+static char drop_buffer[PAGE_SIZE];
+
+/*
+ * smb_receive_drop - read and throw away the data
+ * Only called by the smbiod thread.
+ *
+ * FIXME: we are in the kernel, could we just tell the socket that we want
+ * to drop stuff from the buffer?
+ */
+int
+smb_receive_drop(struct smb_sb_info *server)
+{
+	struct socket *sock;
+	unsigned int flags;
+	struct kvec iov;
+	struct msghdr msg;
+	int rlen = smb_len(server->header) - server->smb_read + 4;
+	int result = -EIO;
+
+	if (rlen > PAGE_SIZE)
+		rlen = PAGE_SIZE;
+
+	sock = server_sock(server);
+	if (!sock)
+		goto out;
+	if (sock->sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+	iov.iov_base = drop_buffer;
+	iov.iov_len = PAGE_SIZE;
+	msg.msg_flags = flags;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+
+	result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
+
+	VERBOSE("read: %d\n", result);
+	if (result < 0) {
+		VERBOSE("receive error: %d\n", result);
+		goto out;
+	}
+	server->smb_read += result;
+
+	if (server->smb_read >= server->smb_len)
+		server->rstate = SMB_RECV_END;
+
+out:
+	return result;
+}
+
+/*
+ * smb_receive
+ * Only called by the smbiod thread.
+ */
+int
+smb_receive(struct smb_sb_info *server, struct smb_request *req)
+{
+	struct socket *sock;
+	unsigned int flags;
+	struct kvec iov[4];
+	struct kvec *p = req->rq_iov;
+	size_t num = req->rq_iovlen;
+	struct msghdr msg;
+	int rlen;
+	int result = -EIO;
+
+	sock = server_sock(server);
+	if (!sock)
+		goto out;
+	if (sock->sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+	msg.msg_flags = flags;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+
+	/* Dont repeat bytes and count available bufferspace */
+	rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
+			(req->rq_rlen - req->rq_bytes_recvd));
+
+	result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
+
+	VERBOSE("read: %d\n", result);
+	if (result < 0) {
+		VERBOSE("receive error: %d\n", result);
+		goto out;
+	}
+	req->rq_bytes_recvd += result;
+	server->smb_read += result;
+
+out:
+	return result;
+}
+
+/*
+ * Try to send a SMB request. This may return after sending only parts of the
+ * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
+ *
+ * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
+ */
+int
+smb_send_request(struct smb_request *req)
+{
+	struct smb_sb_info *server = req->rq_server;
+	struct socket *sock;
+	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
+        int slen = req->rq_slen - req->rq_bytes_sent;
+	int result = -EIO;
+	struct kvec iov[4];
+	struct kvec *p = req->rq_iov;
+	size_t num = req->rq_iovlen;
+
+	sock = server_sock(server);
+	if (!sock)
+		goto out;
+	if (sock->sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	/* Dont repeat bytes */
+	if (req->rq_bytes_sent)
+		smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
+
+	result = kernel_sendmsg(sock, &msg, p, num, slen);
+
+	if (result >= 0) {
+		req->rq_bytes_sent += result;
+		if (req->rq_bytes_sent >= req->rq_slen)
+			req->rq_flags |= SMB_REQ_TRANSMITTED;
+	}
+out:
+	return result;
+}
diff --git a/drivers/staging/smbfs/symlink.c b/drivers/staging/smbfs/symlink.c
new file mode 100644
index 000000000000..632c4acd062d
--- /dev/null
+++ b/drivers/staging/smbfs/symlink.c
@@ -0,0 +1,67 @@
+/*
+ *  symlink.c
+ *
+ *  Copyright (C) 2002 by John Newbigin
+ *
+ *  Please add a note about your changes to smbfs in the ChangeLog file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/net.h>
+#include <linux/namei.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include "smbno.h"
+#include "smb_fs.h"
+#include "smb_debug.h"
+#include "proto.h"
+
+int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
+{
+	DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
+
+	return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
+}
+
+static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *link = __getname();
+	DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
+
+	if (!link) {
+		link = ERR_PTR(-ENOMEM);
+	} else {
+		int len = smb_proc_read_link(server_from_dentry(dentry),
+						dentry, link, PATH_MAX - 1);
+		if (len < 0) {
+			__putname(link);
+			link = ERR_PTR(len);
+		} else {
+			link[len] = 0;
+		}
+	}
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		__putname(s);
+}
+
+const struct inode_operations smb_link_inode_operations =
+{
+	.readlink	= generic_readlink,
+	.follow_link	= smb_follow_link,
+	.put_link	= smb_put_link,
+};
diff --git a/fs/Kconfig b/fs/Kconfig
index 30da8ee16a96..25ce2dc1c6d4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,7 +233,6 @@ config NFS_COMMON
 	default y
 
 source "net/sunrpc/Kconfig"
-source "fs/smbfs/Kconfig"
 source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index e571feddd7b7..9284c74c2db9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -91,7 +91,6 @@ obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)		+= lockd/
 obj-$(CONFIG_NLS)		+= nls/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
-obj-$(CONFIG_SMB_FS)		+= smbfs/
 obj-$(CONFIG_CIFS)		+= cifs/
 obj-$(CONFIG_NCP_FS)		+= ncpfs/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..b42f29a44edb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/smb.h>
-#include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
@@ -745,30 +743,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
 	return raw_data;
 }
 
-struct compat_smb_mount_data {
-	compat_int_t version;
-	__compat_uid_t mounted_uid;
-	__compat_uid_t uid;
-	__compat_gid_t gid;
-	compat_mode_t file_mode;
-	compat_mode_t dir_mode;
-};
-
-static void *do_smb_super_data_conv(void *raw_data)
-{
-	struct smb_mount_data *s = raw_data;
-	struct compat_smb_mount_data *c_s = raw_data;
-
-	if (c_s->version != SMB_MOUNT_OLDVERSION)
-		goto out;
-	s->dir_mode = c_s->dir_mode;
-	s->file_mode = c_s->file_mode;
-	s->gid = c_s->gid;
-	s->uid = c_s->uid;
-	s->mounted_uid = c_s->mounted_uid;
- out:
-	return raw_data;
-}
 
 struct compat_nfs_string {
 	compat_uint_t len;
@@ -835,7 +809,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
 	return 0;
 }
 
-#define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
 
@@ -870,9 +843,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
 	retval = -EINVAL;
 
 	if (kernel_type && data_page) {
-		if (!strcmp(kernel_type, SMBFS_NAME)) {
-			do_smb_super_data_conv((void *)data_page);
-		} else if (!strcmp(kernel_type, NCPFS_NAME)) {
+		if (!strcmp(kernel_type, NCPFS_NAME)) {
 			do_ncp_super_data_conv((void *)data_page);
 		} else if (!strcmp(kernel_type, NFS4_NAME)) {
 			if (do_nfs4_super_data_conv((void *) data_page))
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..34cf03cd791f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -46,7 +46,6 @@
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
-#include <linux/smb_fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 
 #endif /* CONFIG_BLOCK */
 
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
-			compat_uid_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	__kernel_uid_t kuid;
-	int err;
-
-	cmd = SMB_IOC_GETMOUNTUID;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-	set_fs(old_fs);
-
-	if (err >= 0)
-		err = put_user(kuid, argp);
-
-	return err;
-}
-
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO		_IOW('U', 200, int)
 #define HCIUARTGETPROTO		_IOR('U', 201, int)
@@ -1265,8 +1245,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
-/* SMB ioctls which do not need any translations */
-COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1528,10 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case RAW_GETBIND:
 		return raw_ioctl(fd, cmd, argp);
 #endif
-	/* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-	case SMB_IOC_GETMOUNTUID_32:
-		return do_smb_getmountuid(fd, cmd, argp);
 	/* Serial */
 	case TIOCGSERIAL:
 	case TIOCSSERIAL:
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2e..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks.  Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory.  Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI.  For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs.  Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-
-obj-$(CONFIG_SMB_FS) += smbfs.o
-
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-		symlink.o smbiod.o request.o
-
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
-
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-
-#include <asm/page.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-	struct smb_sb_info *server = server_from_inode(dir);
-	union  smb_dir_cache *cache = NULL;
-	struct page *page = NULL;
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto out;
-
-	if (!PageUptodate(page))
-		goto out_unlock;
-
-	cache = kmap(page);
-	cache->head.time = jiffies - SMB_MAX_AGE(server);
-
-	kunmap(page);
-	SetPageUptodate(page);
-out_unlock:
-	unlock_page(page);
-	page_cache_release(page);
-out:
-	return;
-}
-
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-	struct smb_sb_info *server = server_from_dentry(parent);
-	struct list_head *next;
-	struct dentry *dentry;
-
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_u.d_child);
-		dentry->d_fsdata = NULL;
-		smb_age_dentry(server, dentry);
-		next = next->next;
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-	struct dentry *dent = dentry;
-	struct list_head *next;
-
-	if (d_validate(dent, parent)) {
-		if (dent->d_name.len <= SMB_MAXNAMELEN &&
-		    (unsigned long)dent->d_fsdata == fpos) {
-			if (!dent->d_inode) {
-				dput(dent);
-				dent = NULL;
-			}
-			return dent;
-		}
-		dput(dent);
-	}
-
-	/* If a pointer is invalid, we search the dentry. */
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_u.d_child);
-		if ((unsigned long)dent->d_fsdata == fpos) {
-			if (dent->d_inode)
-				dget_locked(dent);
-			else
-				dent = NULL;
-			goto out_unlock;
-		}
-		next = next->next;
-	}
-	dent = NULL;
-out_unlock:
-	spin_unlock(&dcache_lock);
-	return dent;
-}
-
-
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	       struct smb_cache_control *ctrl, struct qstr *qname,
-	       struct smb_fattr *entry)
-{
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
-	struct smb_cache_control ctl = *ctrl;
-	int valid = 0;
-	int hashed = 0;
-	ino_t ino = 0;
-
-	qname->hash = full_name_hash(qname->name, qname->len);
-
-	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, qname) != 0)
-			goto end_advance;
-
-	newdent = d_lookup(dentry, qname);
-
-	if (!newdent) {
-		newdent = d_alloc(dentry, qname);
-		if (!newdent)
-			goto end_advance;
-	} else {
-		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname->name,
-		       newdent->d_name.len);
-	}
-
-	if (!newdent->d_inode) {
-		smb_renew_times(newdent);
-		entry->f_ino = iunique(inode->i_sb, 2);
-		newino = smb_iget(inode->i_sb, entry);
-		if (newino) {
-			smb_new_dentry(newdent);
-			d_instantiate(newdent, newino);
-			if (!hashed)
-				d_rehash(newdent);
-		}
-	} else
-		smb_set_inode_attr(newdent->d_inode, entry);
-
-        if (newdent->d_inode) {
-		ino = newdent->d_inode->i_ino;
-		newdent->d_fsdata = (void *) ctl.fpos;
-		smb_new_dentry(newdent);
-	}
-
-	if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-		}
-		ctl.cache = NULL;
-		ctl.idx  -= SMB_DIRCACHE_SIZE;
-		ctl.ofs  += 1;
-		ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
-		if (ctl.page)
-			ctl.cache = kmap(ctl.page);
-	}
-	if (ctl.cache) {
-		ctl.cache->dentry[ctl.idx] = newdent;
-		valid = 1;
-	}
-	dput(newdent);
-
-end_advance:
-	if (!valid)
-		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, qname);
-		if (!ino)
-			ino = iunique(inode->i_sb, 2);
-		ctl.filled = filldir(dirent, qname->name, qname->len,
-				     filp->f_pos, ino, DT_UNKNOWN);
-		if (!ctl.filled)
-			filp->f_pos += 1;
-	}
-	ctl.fpos += 1;
-	ctl.idx  += 1;
-	*ctrl = ctl;
-	return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  dir.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/ctype.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <linux/smbno.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-static int smb_readdir(struct file *, void *, filldir_t);
-static int smb_dir_open(struct inode *, struct file *);
-
-static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int smb_mkdir(struct inode *, struct dentry *, int);
-static int smb_rmdir(struct inode *, struct dentry *);
-static int smb_unlink(struct inode *, struct dentry *);
-static int smb_rename(struct inode *, struct dentry *,
-		      struct inode *, struct dentry *);
-static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
-static int smb_link(struct dentry *, struct inode *, struct dentry *);
-
-const struct file_operations smb_dir_operations =
-{
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= smb_readdir,
-	.unlocked_ioctl	= smb_ioctl,
-	.open		= smb_dir_open,
-};
-
-const struct inode_operations smb_dir_inode_operations =
-{
-	.create		= smb_create,
-	.lookup		= smb_lookup,
-	.unlink		= smb_unlink,
-	.mkdir		= smb_mkdir,
-	.rmdir		= smb_rmdir,
-	.rename		= smb_rename,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-};
-
-const struct inode_operations smb_dir_inode_operations_unix =
-{
-	.create		= smb_create,
-	.lookup		= smb_lookup,
-	.unlink		= smb_unlink,
-	.mkdir		= smb_mkdir,
-	.rmdir		= smb_rmdir,
-	.rename		= smb_rename,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-	.symlink	= smb_symlink,
-	.mknod		= smb_make_node,
-	.link		= smb_link,
-};
-
-/*
- * Read a directory, using filldir to fill the dirent memory.
- * smb_proc_readdir does the actual reading from the smb server.
- *
- * The cache code is almost directly taken from ncpfs
- */
-static int 
-smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *dir = dentry->d_inode;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	union  smb_dir_cache *cache = NULL;
-	struct smb_cache_control ctl;
-	struct page *page = NULL;
-	int result;
-
-	ctl.page  = NULL;
-	ctl.cache = NULL;
-
-	VERBOSE("reading %s/%s, f_pos=%d\n",
-		DENTRY_PATH(dentry),  (int) filp->f_pos);
-
-	result = 0;
-
-	lock_kernel();
-
-	switch ((unsigned int) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = 1;
-		/* fallthrough */
-	case 1:
-		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
-			goto out;
-		filp->f_pos = 2;
-	}
-
-	/*
-	 * Make sure our inode is up-to-date.
-	 */
-	result = smb_revalidate_inode(dentry);
-	if (result)
-		goto out;
-
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto read_really;
-
-	ctl.cache = cache = kmap(page);
-	ctl.head  = cache->head;
-
-	if (!PageUptodate(page) || !ctl.head.eof) {
-		VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
-			 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
-		goto init_cache;
-	}
-
-	if (filp->f_pos == 2) {
-		if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
-			goto init_cache;
-
-		/*
-		 * N.B. ncpfs checks mtime of dentry too here, we don't.
-		 *   1. common smb servers do not update mtime on dir changes
-		 *   2. it requires an extra smb request
-		 *      (revalidate has the same timeout as ctl.head.time)
-		 *
-		 * Instead smbfs invalidates its own cache on local changes
-		 * and remote changes are not seen until timeout.
-		 */
-	}
-
-	if (filp->f_pos > ctl.head.end)
-		goto finished;
-
-	ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
-	ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
-	ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;
-
-	for (;;) {
-		if (ctl.ofs != 0) {
-			ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
-			if (!ctl.page)
-				goto invalid_cache;
-			ctl.cache = kmap(ctl.page);
-			if (!PageUptodate(ctl.page))
-				goto invalid_cache;
-		}
-		while (ctl.idx < SMB_DIRCACHE_SIZE) {
-			struct dentry *dent;
-			int res;
-
-			dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
-					     dentry, filp->f_pos);
-			if (!dent)
-				goto invalid_cache;
-
-			res = filldir(dirent, dent->d_name.name,
-				      dent->d_name.len, filp->f_pos,
-				      dent->d_inode->i_ino, DT_UNKNOWN);
-			dput(dent);
-			if (res)
-				goto finished;
-			filp->f_pos += 1;
-			ctl.idx += 1;
-			if (filp->f_pos > ctl.head.end)
-				goto finished;
-		}
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-			ctl.page = NULL;
-		}
-		ctl.idx  = 0;
-		ctl.ofs += 1;
-	}
-invalid_cache:
-	if (ctl.page) {
-		kunmap(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-		ctl.page = NULL;
-	}
-	ctl.cache = cache;
-init_cache:
-	smb_invalidate_dircache_entries(dentry);
-	ctl.head.time = jiffies;
-	ctl.head.eof = 0;
-	ctl.fpos = 2;
-	ctl.ofs = 0;
-	ctl.idx = SMB_DIRCACHE_START;
-	ctl.filled = 0;
-	ctl.valid  = 1;
-read_really:
-	result = server->ops->readdir(filp, dirent, filldir, &ctl);
-	if (result == -ERESTARTSYS && page)
-		ClearPageUptodate(page);
-	if (ctl.idx == -1)
-		goto invalid_cache;	/* retry */
-	ctl.head.end = ctl.fpos - 1;
-	ctl.head.eof = ctl.valid;
-finished:
-	if (page) {
-		cache->head = ctl.head;
-		kunmap(page);
-		if (result != -ERESTARTSYS)
-			SetPageUptodate(page);
-		unlock_page(page);
-		page_cache_release(page);
-	}
-	if (ctl.page) {
-		kunmap(ctl.page);
-		SetPageUptodate(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-	}
-out:
-	unlock_kernel();
-	return result;
-}
-
-static int
-smb_dir_open(struct inode *dir, struct file *file)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct smb_sb_info *server;
-	int error = 0;
-
-	VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name);
-
-	/*
-	 * Directory timestamps in the core protocol aren't updated
-	 * when a file is added, so we give them a very short TTL.
-	 */
-	lock_kernel();
-	server = server_from_dentry(dentry);
-	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
-		unsigned long age = jiffies - SMB_I(dir)->oldmtime;
-		if (age > 2*HZ)
-			smb_invalid_dir_cache(dir);
-	}
-
-	/*
-	 * Note: in order to allow the smbmount process to open the
-	 * mount point, we only revalidate if the connection is valid or
-	 * if the process is trying to access something other than the root.
-	 */
-	if (server->state == CONN_VALID || !IS_ROOT(dentry))
-		error = smb_revalidate_inode(dentry);
-	unlock_kernel();
-	return error;
-}
-
-/*
- * Dentry operations routines
- */
-static int smb_lookup_validate(struct dentry *, struct nameidata *);
-static int smb_hash_dentry(struct dentry *, struct qstr *);
-static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
-static int smb_delete_dentry(struct dentry *);
-
-static const struct dentry_operations smbfs_dentry_operations =
-{
-	.d_revalidate	= smb_lookup_validate,
-	.d_hash		= smb_hash_dentry,
-	.d_compare	= smb_compare_dentry,
-	.d_delete	= smb_delete_dentry,
-};
-
-static const struct dentry_operations smbfs_dentry_operations_case =
-{
-	.d_revalidate	= smb_lookup_validate,
-	.d_delete	= smb_delete_dentry,
-};
-
-
-/*
- * This is the callback when the dcache has a lookup hit.
- */
-static int
-smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode * inode = dentry->d_inode;
-	unsigned long age = jiffies - dentry->d_time;
-	int valid;
-
-	/*
-	 * The default validation is based on dentry age:
-	 * we believe in dentries for a few seconds.  (But each
-	 * successful server lookup renews the timestamp.)
-	 */
-	valid = (age <= SMB_MAX_AGE(server));
-#ifdef SMBFS_DEBUG_VERBOSE
-	if (!valid)
-		VERBOSE("%s/%s not valid, age=%lu\n", 
-			DENTRY_PATH(dentry), age);
-#endif
-
-	if (inode) {
-		lock_kernel();
-		if (is_bad_inode(inode)) {
-			PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
-			valid = 0;
-		} else if (!valid)
-			valid = (smb_revalidate_inode(dentry) == 0);
-		unlock_kernel();
-	} else {
-		/*
-		 * What should we do for negative dentries?
-		 */
-	}
-	return valid;
-}
-
-static int 
-smb_hash_dentry(struct dentry *dir, struct qstr *this)
-{
-	unsigned long hash;
-	int i;
-
-	hash = init_name_hash();
-	for (i=0; i < this->len ; i++)
-		hash = partial_name_hash(tolower(this->name[i]), hash);
-	this->hash = end_name_hash(hash);
-  
-	return 0;
-}
-
-static int
-smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
-{
-	int i, result = 1;
-
-	if (a->len != b->len)
-		goto out;
-	for (i=0; i < a->len; i++) {
-		if (tolower(a->name[i]) != tolower(b->name[i]))
-			goto out;
-	}
-	result = 0;
-out:
-	return result;
-}
-
-/*
- * This is the callback from dput() when d_count is going to 0.
- * We use this to unhash dentries with bad inodes.
- */
-static int
-smb_delete_dentry(struct dentry * dentry)
-{
-	if (dentry->d_inode) {
-		if (is_bad_inode(dentry->d_inode)) {
-			PARANOIA("bad inode, unhashing %s/%s\n",
-				 DENTRY_PATH(dentry));
-			return 1;
-		}
-	} else {
-		/* N.B. Unhash negative dentries? */
-	}
-	return 0;
-}
-
-/*
- * Initialize a new dentry
- */
-void
-smb_new_dentry(struct dentry *dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-
-	if (server->mnt->flags & SMB_MOUNT_CASE)
-		dentry->d_op = &smbfs_dentry_operations_case;
-	else
-		dentry->d_op = &smbfs_dentry_operations;
-	dentry->d_time = jiffies;
-}
-
-
-/*
- * Whenever a lookup succeeds, we know the parent directories
- * are all valid, so we want to update the dentry timestamps.
- * N.B. Move this to dcache?
- */
-void
-smb_renew_times(struct dentry * dentry)
-{
-	dget(dentry);
-	spin_lock(&dentry->d_lock);
-	for (;;) {
-		struct dentry *parent;
-
-		dentry->d_time = jiffies;
-		if (IS_ROOT(dentry))
-			break;
-		parent = dentry->d_parent;
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		dput(dentry);
-		dentry = parent;
-		spin_lock(&dentry->d_lock);
-	}
-	spin_unlock(&dentry->d_lock);
-	dput(dentry);
-}
-
-static struct dentry *
-smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	struct smb_fattr finfo;
-	struct inode *inode;
-	int error;
-	struct smb_sb_info *server;
-
-	error = -ENAMETOOLONG;
-	if (dentry->d_name.len > SMB_MAXNAMELEN)
-		goto out;
-
-	/* Do not allow lookup of names with backslashes in */
-	error = -EINVAL;
-	if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
-		goto out;
-
-	lock_kernel();
-	error = smb_proc_getattr(dentry, &finfo);
-#ifdef SMBFS_PARANOIA
-	if (error && error != -ENOENT)
-		PARANOIA("find %s/%s failed, error=%d\n",
-			 DENTRY_PATH(dentry), error);
-#endif
-
-	inode = NULL;
-	if (error == -ENOENT)
-		goto add_entry;
-	if (!error) {
-		error = -EACCES;
-		finfo.f_ino = iunique(dentry->d_sb, 2);
-		inode = smb_iget(dir->i_sb, &finfo);
-		if (inode) {
-	add_entry:
-			server = server_from_dentry(dentry);
-			if (server->mnt->flags & SMB_MOUNT_CASE)
-				dentry->d_op = &smbfs_dentry_operations_case;
-			else
-				dentry->d_op = &smbfs_dentry_operations;
-
-			d_add(dentry, inode);
-			smb_renew_times(dentry);
-			error = 0;
-		}
-	}
-	unlock_kernel();
-out:
-	return ERR_PTR(error);
-}
-
-/*
- * This code is common to all routines creating a new inode.
- */
-static int
-smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode *inode;
-	int error;
-	struct smb_fattr fattr;
-
-	VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
-
-	error = smb_proc_getattr(dentry, &fattr);
-	if (error)
-		goto out_close;
-
-	smb_renew_times(dentry);
-	fattr.f_ino = iunique(dentry->d_sb, 2);
-	inode = smb_iget(dentry->d_sb, &fattr);
-	if (!inode)
-		goto out_no_inode;
-
-	if (have_id) {
-		struct smb_inode_info *ei = SMB_I(inode);
-		ei->fileid = fileid;
-		ei->access = SMB_O_RDWR;
-		ei->open = server->generation;
-	}
-	d_instantiate(dentry, inode);
-out:
-	return error;
-
-out_no_inode:
-	error = -EACCES;
-out_close:
-	if (have_id) {
-		PARANOIA("%s/%s failed, error=%d, closing %u\n",
-			 DENTRY_PATH(dentry), error, fileid);
-		smb_close_fileid(dentry, fileid);
-	}
-	goto out;
-}
-
-/* N.B. How should the mode argument be used? */
-static int
-smb_create(struct inode *dir, struct dentry *dentry, int mode,
-		struct nameidata *nd)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	__u16 fileid;
-	int error;
-	struct iattr attr;
-
-	VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
-
-	lock_kernel();
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
-	if (!error) {
-		if (server->opt.capabilities & SMB_CAP_UNIX) {
-			/* Set attributes for new file */
-			attr.ia_valid = ATTR_MODE;
-			attr.ia_mode = mode;
-			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-		}
-		error = smb_instantiate(dentry, fileid, 1);
-	} else {
-		PARANOIA("%s/%s failed, error=%d\n",
-			 DENTRY_PATH(dentry), error);
-	}
-	unlock_kernel();
-	return error;
-}
-
-/* N.B. How should the mode argument be used? */
-static int
-smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int error;
-	struct iattr attr;
-
-	lock_kernel();
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_mkdir(dentry);
-	if (!error) {
-		if (server->opt.capabilities & SMB_CAP_UNIX) {
-			/* Set attributes for new directory */
-			attr.ia_valid = ATTR_MODE;
-			attr.ia_mode = mode;
-			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-		}
-		error = smb_instantiate(dentry, 0, 0);
-	}
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	int error;
-
-	/*
-	 * Close the directory if it's open.
-	 */
-	lock_kernel();
-	smb_close(inode);
-
-	/*
-	 * Check that nobody else is using the directory..
-	 */
-	error = -EBUSY;
-	if (!d_unhashed(dentry))
-		goto out;
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_rmdir(dentry);
-
-out:
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	/*
-	 * Close the file if it's open.
-	 */
-	lock_kernel();
-	smb_close(dentry->d_inode);
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_unlink(dentry);
-	if (!error)
-		smb_renew_times(dentry);
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_rename(struct inode *old_dir, struct dentry *old_dentry,
-	   struct inode *new_dir, struct dentry *new_dentry)
-{
-	int error;
-
-	/*
-	 * Close any open files, and check whether to delete the
-	 * target before attempting the rename.
-	 */
-	lock_kernel();
-	if (old_dentry->d_inode)
-		smb_close(old_dentry->d_inode);
-	if (new_dentry->d_inode) {
-		smb_close(new_dentry->d_inode);
-		error = smb_proc_unlink(new_dentry);
-		if (error) {
-			VERBOSE("unlink %s/%s, error=%d\n",
-				DENTRY_PATH(new_dentry), error);
-			goto out;
-		}
-		/* FIXME */
-		d_delete(new_dentry);
-	}
-
-	smb_invalid_dir_cache(old_dir);
-	smb_invalid_dir_cache(new_dir);
-	error = smb_proc_mv(old_dentry, new_dentry);
-	if (!error) {
-		smb_renew_times(old_dentry);
-		smb_renew_times(new_dentry);
-	}
-out:
-	unlock_kernel();
-	return error;
-}
-
-/*
- * FIXME: samba servers won't let you create device nodes unless uid/gid
- * matches the connection credentials (and we don't know which those are ...)
- */
-static int
-smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-	int error;
-	struct iattr attr;
-
-	attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
-	attr.ia_mode = mode;
-	current_euid_egid(&attr.ia_uid, &attr.ia_gid);
-
-	if (!new_valid_dev(dev))
-		return -EINVAL;
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
-	if (!error) {
-		error = smb_instantiate(dentry, 0, 0);
-	}
-	return error;
-}
-
-/*
- * dentry = existing file
- * new_dentry = new file
- */
-static int
-smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
-{
-	int error;
-
-	DEBUG1("smb_link old=%s/%s new=%s/%s\n",
-	       DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
-	if (!error) {
-		smb_renew_times(dentry);
-		error = smb_instantiate(new_dentry, 0, 0);
-	}
-	return error;
-}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- *  file.c
- *
- *  Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/aio.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-static int
-smb_fsync(struct file *file, int datasync)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int result;
-
-	VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
-
-	/*
-	 * The VFS will writepage() all dirty pages for us, but we
-	 * should send a SMBflush to the server, letting it know that
-	 * we want things synchronized with actual storage.
-	 *
-	 * Note: this function requires all pages to have been written already
-	 *       (should be ok with writepage_sync)
-	 */
-	result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
-	return result;
-}
-
-/*
- * Read a page synchronously.
- */
-static int
-smb_readpage_sync(struct dentry *dentry, struct page *page)
-{
-	char *buffer = kmap(page);
-	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	unsigned int rsize = smb_get_rsize(server);
-	int count = PAGE_SIZE;
-	int result;
-
-	VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
-		DENTRY_PATH(dentry), count, offset, rsize);
-
-	result = smb_open(dentry, SMB_O_RDONLY);
-	if (result < 0)
-		goto io_error;
-
-	do {
-		if (count < rsize)
-			rsize = count;
-
-		result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
-		if (result < 0)
-			goto io_error;
-
-		count -= result;
-		offset += result;
-		buffer += result;
-		dentry->d_inode->i_atime =
-			current_fs_time(dentry->d_inode->i_sb);
-		if (result < rsize)
-			break;
-	} while (count);
-
-	memset(buffer, 0, count);
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	result = 0;
-
-io_error:
-	kunmap(page);
-	unlock_page(page);
-	return result;
-}
-
-/*
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_readpage(struct file *file, struct page *page)
-{
-	int		error;
-	struct dentry  *dentry = file->f_path.dentry;
-
-	page_cache_get(page);
-	error = smb_readpage_sync(dentry, page);
-	page_cache_release(page);
-	return error;
-}
-
-/*
- * Write a page synchronously.
- * Offset is the data offset within the page.
- */
-static int
-smb_writepage_sync(struct inode *inode, struct page *page,
-		   unsigned long pageoffset, unsigned int count)
-{
-	loff_t offset;
-	char *buffer = kmap(page) + pageoffset;
-	struct smb_sb_info *server = server_from_inode(inode);
-	unsigned int wsize = smb_get_wsize(server);
-	int ret = 0;
-
-	offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
-	VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
-
-	do {
-		int write_ret;
-
-		if (count < wsize)
-			wsize = count;
-
-		write_ret = server->ops->write(inode, offset, wsize, buffer);
-		if (write_ret < 0) {
-			PARANOIA("failed write, wsize=%d, write_ret=%d\n",
-				 wsize, write_ret);
-			ret = write_ret;
-			break;
-		}
-		/* N.B. what if result < wsize?? */
-#ifdef SMBFS_PARANOIA
-		if (write_ret < wsize)
-			PARANOIA("short write, wsize=%d, write_ret=%d\n",
-				 wsize, write_ret);
-#endif
-		buffer += wsize;
-		offset += wsize;
-		count -= wsize;
-		/*
-		 * Update the inode now rather than waiting for a refresh.
-		 */
-		inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
-		SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
-		if (offset > inode->i_size)
-			inode->i_size = offset;
-	} while (count);
-
-	kunmap(page);
-	return ret;
-}
-
-/*
- * Write a page to the server. This will be used for NFS swapping only
- * (for now), and we currently do this synchronously only.
- *
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode;
-	unsigned long end_index;
-	unsigned offset = PAGE_CACHE_SIZE;
-	int err;
-
-	BUG_ON(!mapping);
-	inode = mapping->host;
-	BUG_ON(!inode);
-
-	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-
-	/* easy case */
-	if (page->index < end_index)
-		goto do_it;
-	/* things got complicated... */
-	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
-	/* OK, are we completely out? */
-	if (page->index >= end_index+1 || !offset)
-		return 0; /* truncated - don't care */
-do_it:
-	page_cache_get(page);
-	err = smb_writepage_sync(inode, page, 0, offset);
-	SetPageUptodate(page);
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
-static int
-smb_updatepage(struct file *file, struct page *page, unsigned long offset,
-	       unsigned int count)
-{
-	struct dentry *dentry = file->f_path.dentry;
-
-	DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
-		((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
-
-	return smb_writepage_sync(dentry->d_inode, page, offset, count);
-}
-
-static ssize_t
-smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	struct file * file = iocb->ki_filp;
-	struct dentry * dentry = file->f_path.dentry;
-	ssize_t	status;
-
-	VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-		(unsigned long) iocb->ki_left, (unsigned long) pos);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-
-	VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
-		(long)dentry->d_inode->i_size,
-		dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
-
-	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
-out:
-	return status;
-}
-
-static int
-smb_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	struct dentry * dentry = file->f_path.dentry;
-	int	status;
-
-	VERBOSE("file %s/%s, address %lu - %lu\n",
-		DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%d\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-	status = generic_file_mmap(file, vma);
-out:
-	return status;
-}
-
-static ssize_t
-smb_file_splice_read(struct file *file, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t count,
-		     unsigned int flags)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	ssize_t status;
-
-	VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
-		DENTRY_PATH(dentry), *ppos, count);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-	status = generic_file_splice_read(file, ppos, pipe, count, flags);
-out:
-	return status;
-}
-
-/*
- * This does the "real" work of the write. The generic routine has
- * allocated the page, locked it, done all the page alignment stuff
- * calculations etc. Now we should just copy the data from user
- * space and write it back to the real medium..
- *
- * If the writer ends up delaying the write, the writer needs to
- * increment the page use counts until he is done with the page.
- */
-static int smb_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
-{
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = grab_cache_page_write_begin(mapping, index, flags);
-	if (!*pagep)
-		return -ENOMEM;
-	return 0;
-}
-
-static int smb_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata)
-{
-	int status;
-	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-
-	lock_kernel();
-	status = smb_updatepage(file, page, offset, copied);
-	unlock_kernel();
-
-	if (!status) {
-		if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
-			SetPageUptodate(page);
-		status = copied;
-	}
-
-	unlock_page(page);
-	page_cache_release(page);
-
-	return status;
-}
-
-const struct address_space_operations smb_file_aops = {
-	.readpage = smb_readpage,
-	.writepage = smb_writepage,
-	.write_begin = smb_write_begin,
-	.write_end = smb_write_end,
-};
-
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos)
-{
-	struct file * file = iocb->ki_filp;
-	struct dentry * dentry = file->f_path.dentry;
-	ssize_t	result;
-
-	VERBOSE("file %s/%s, count=%lu@%lu\n",
-		DENTRY_PATH(dentry),
-		(unsigned long) iocb->ki_left, (unsigned long) pos);
-
-	result = smb_revalidate_inode(dentry);
-	if (result) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), result);
-		goto out;
-	}
-
-	result = smb_open(dentry, SMB_O_WRONLY);
-	if (result)
-		goto out;
-
-	if (iocb->ki_left > 0) {
-		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
-		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
-			(long) file->f_pos, (long) dentry->d_inode->i_size,
-			dentry->d_inode->i_mtime.tv_sec,
-			dentry->d_inode->i_atime.tv_sec);
-	}
-out:
-	return result;
-}
-
-static int
-smb_file_open(struct inode *inode, struct file * file)
-{
-	int result;
-	struct dentry *dentry = file->f_path.dentry;
-	int smb_mode = (file->f_mode & O_ACCMODE) - 1;
-
-	lock_kernel();
-	result = smb_open(dentry, smb_mode);
-	if (result)
-		goto out;
-	SMB_I(inode)->openers++;
-out:
-	unlock_kernel();
-	return result;
-}
-
-static int
-smb_file_release(struct inode *inode, struct file * file)
-{
-	lock_kernel();
-	if (!--SMB_I(inode)->openers) {
-		/* We must flush any dirty pages now as we won't be able to
-		   write anything after close. mmap can trigger this.
-		   "openers" should perhaps include mmap'ers ... */
-		filemap_write_and_wait(inode->i_mapping);
-		smb_close(inode);
-	}
-	unlock_kernel();
-	return 0;
-}
-
-/*
- * Check whether the required access is compatible with
- * an inode's permission. SMB doesn't recognize superuser
- * privileges, so we need our own check for this.
- */
-static int
-smb_file_permission(struct inode *inode, int mask)
-{
-	int mode = inode->i_mode;
-	int error = 0;
-
-	VERBOSE("mode=%x, mask=%x\n", mode, mask);
-
-	/* Look at user permissions */
-	mode >>= 6;
-	if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
-		error = -EACCES;
-	return error;
-}
-
-static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
-const struct file_operations smb_file_operations =
-{
-	.llseek 	= smb_remote_llseek,
-	.read		= do_sync_read,
-	.aio_read	= smb_file_aio_read,
-	.write		= do_sync_write,
-	.aio_write	= smb_file_aio_write,
-	.unlocked_ioctl	= smb_ioctl,
-	.mmap		= smb_file_mmap,
-	.open		= smb_file_open,
-	.release	= smb_file_release,
-	.fsync		= smb_fsync,
-	.splice_read	= smb_file_splice_read,
-};
-
-const struct inode_operations smb_file_inode_operations =
-{
-	.permission	= smb_file_permission,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * getopt.c
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/net.h>
-
-#include "getopt.h"
-
-/**
- *	smb_getopt - option parser
- *	@caller: name of the caller, for error messages
- *	@options: the options string
- *	@opts: an array of &struct option entries controlling parser operations
- *	@optopt: output; will contain the current option
- *	@optarg: output; will contain the value (if one exists)
- *	@flag: output; may be NULL; should point to a long for or'ing flags
- *	@value: output; may be NULL; will be overwritten with the integer value
- *		of the current argument.
- *
- *	Helper to parse options on the format used by mount ("a=b,c=d,e,f").
- *	Returns opts->val if a matching entry in the 'opts' array is found,
- *	0 when no more tokens are found, -1 if an error is encountered.
- */
-int smb_getopt(char *caller, char **options, struct option *opts,
-	       char **optopt, char **optarg, unsigned long *flag,
-	       unsigned long *value)
-{
-	char *token;
-	char *val;
-	int i;
-
-	do {
-		if ((token = strsep(options, ",")) == NULL)
-			return 0;
-	} while (*token == '\0');
-	*optopt = token;
-
-	*optarg = NULL;
-	if ((val = strchr (token, '=')) != NULL) {
-		*val++ = 0;
-		if (value)
-			*value = simple_strtoul(val, NULL, 0);
-		*optarg = val;
-	}
-
-	for (i = 0; opts[i].name != NULL; i++) {
-		if (!strcmp(opts[i].name, token)) {
-			if (!opts[i].flag && (!val || !*val)) {
-				printk("%s: the %s option requires an argument\n",
-				       caller, token);
-				return -1;
-			}
-
-			if (flag && opts[i].flag)
-				*flag |= opts[i].flag;
-
-			return opts[i].val;
-		}
-	}
-	printk("%s: Unrecognized mount option %s\n", caller, token);
-	return -1;
-}
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _LINUX_GETOPT_H
-#define _LINUX_GETOPT_H
-
-struct option {
-	const char *name;
-	unsigned long flag;
-	int val;
-};
-
-extern int smb_getopt(char *caller, char **options, struct option *opts,
-		      char **optopt, char **optarg, unsigned long *flag,
-		      unsigned long *value);
-
-#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 450c91941988..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,839 +0,0 @@
-/*
- *  inode.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-#include <linux/nls.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/highuid.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include "smb_debug.h"
-#include "getopt.h"
-#include "proto.h"
-
-/* Always pick a default string */
-#ifdef CONFIG_SMB_NLS_REMOTE
-#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
-#else
-#define SMB_NLS_REMOTE ""
-#endif
-
-#define SMB_TTL_DEFAULT 1000
-
-static void smb_evict_inode(struct inode *);
-static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct dentry *, struct kstatfs *);
-static int  smb_show_options(struct seq_file *, struct vfsmount *);
-
-static struct kmem_cache *smb_inode_cachep;
-
-static struct inode *smb_alloc_inode(struct super_block *sb)
-{
-	struct smb_inode_info *ei;
-	ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	return &ei->vfs_inode;
-}
-
-static void smb_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int init_inodecache(void)
-{
-	smb_inode_cachep = kmem_cache_create("smb_inode_cache",
-					     sizeof(struct smb_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once);
-	if (smb_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	kmem_cache_destroy(smb_inode_cachep);
-}
-
-static int smb_remount(struct super_block *sb, int *flags, char *data)
-{
-	*flags |= MS_NODIRATIME;
-	return 0;
-}
-
-static const struct super_operations smb_sops =
-{
-	.alloc_inode	= smb_alloc_inode,
-	.destroy_inode	= smb_destroy_inode,
-	.drop_inode	= generic_delete_inode,
-	.evict_inode	= smb_evict_inode,
-	.put_super	= smb_put_super,
-	.statfs		= smb_statfs,
-	.show_options	= smb_show_options,
-	.remount_fs	= smb_remount,
-};
-
-
-/* We are always generating a new inode here */
-struct inode *
-smb_iget(struct super_block *sb, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = SMB_SB(sb);
-	struct inode *result;
-
-	DEBUG1("smb_iget: %p\n", fattr);
-
-	result = new_inode(sb);
-	if (!result)
-		return result;
-	result->i_ino = fattr->f_ino;
-	SMB_I(result)->open = 0;
-	SMB_I(result)->fileid = 0;
-	SMB_I(result)->access = 0;
-	SMB_I(result)->flags = 0;
-	SMB_I(result)->closed = 0;
-	SMB_I(result)->openers = 0;
-	smb_set_inode_attr(result, fattr);
-	if (S_ISREG(result->i_mode)) {
-		result->i_op = &smb_file_inode_operations;
-		result->i_fop = &smb_file_operations;
-		result->i_data.a_ops = &smb_file_aops;
-	} else if (S_ISDIR(result->i_mode)) {
-		if (server->opt.capabilities & SMB_CAP_UNIX)
-			result->i_op = &smb_dir_inode_operations_unix;
-		else
-			result->i_op = &smb_dir_inode_operations;
-		result->i_fop = &smb_dir_operations;
-	} else if (S_ISLNK(result->i_mode)) {
-		result->i_op = &smb_link_inode_operations;
-	} else {
-		init_special_inode(result, result->i_mode, fattr->f_rdev);
-	}
-	insert_inode_hash(result);
-	return result;
-}
-
-/*
- * Copy the inode data to a smb_fattr structure.
- */
-void
-smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-	memset(fattr, 0, sizeof(struct smb_fattr));
-	fattr->f_mode	= inode->i_mode;
-	fattr->f_nlink	= inode->i_nlink;
-	fattr->f_ino	= inode->i_ino;
-	fattr->f_uid	= inode->i_uid;
-	fattr->f_gid	= inode->i_gid;
-	fattr->f_size	= inode->i_size;
-	fattr->f_mtime	= inode->i_mtime;
-	fattr->f_ctime	= inode->i_ctime;
-	fattr->f_atime	= inode->i_atime;
-	fattr->f_blocks	= inode->i_blocks;
-
-	fattr->attr	= SMB_I(inode)->attr;
-	/*
-	 * Keep the attributes in sync with the inode permissions.
-	 */
-	if (fattr->f_mode & S_IWUSR)
-		fattr->attr &= ~aRONLY;
-	else
-		fattr->attr |= aRONLY;
-}
-
-/*
- * Update the inode, possibly causing it to invalidate its pages if mtime/size
- * is different from last time.
- */
-void
-smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-	struct smb_inode_info *ei = SMB_I(inode);
-
-	/*
-	 * A size change should have a different mtime, or same mtime
-	 * but different size.
-	 */
-	time_t last_time = inode->i_mtime.tv_sec;
-	loff_t last_sz = inode->i_size;
-
-	inode->i_mode	= fattr->f_mode;
-	inode->i_nlink	= fattr->f_nlink;
-	inode->i_uid	= fattr->f_uid;
-	inode->i_gid	= fattr->f_gid;
-	inode->i_ctime	= fattr->f_ctime;
-	inode->i_blocks = fattr->f_blocks;
-	inode->i_size	= fattr->f_size;
-	inode->i_mtime	= fattr->f_mtime;
-	inode->i_atime	= fattr->f_atime;
-	ei->attr = fattr->attr;
-
-	/*
-	 * Update the "last time refreshed" field for revalidation.
-	 */
-	ei->oldmtime = jiffies;
-
-	if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
-		VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
-			inode->i_ino,
-			(long) last_time, (long) inode->i_mtime.tv_sec,
-			(long) last_sz, (long) inode->i_size);
-
-		if (!S_ISDIR(inode->i_mode))
-			invalidate_remote_inode(inode);
-	}
-}
-
-/*
- * This is called if the connection has gone bad ...
- * try to kill off all the current inodes.
- */
-void
-smb_invalidate_inodes(struct smb_sb_info *server)
-{
-	VERBOSE("\n");
-	shrink_dcache_sb(SB_of(server));
-	invalidate_inodes(SB_of(server));
-}
-
-/*
- * This is called to update the inode attributes after
- * we've made changes to a file or directory.
- */
-static int
-smb_refresh_inode(struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	int error;
-	struct smb_fattr fattr;
-
-	error = smb_proc_getattr(dentry, &fattr);
-	if (!error) {
-		smb_renew_times(dentry);
-		/*
-		 * Check whether the type part of the mode changed,
-		 * and don't update the attributes if it did.
-		 *
-		 * And don't dick with the root inode
-		 */
-		if (inode->i_ino == 2)
-			return error;
-		if (S_ISLNK(inode->i_mode))
-			return error;	/* VFS will deal with it */
-
-		if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
-			smb_set_inode_attr(inode, &fattr);
-		} else {
-			/*
-			 * Big trouble! The inode has become a new object,
-			 * so any operations attempted on it are invalid.
-			 *
-			 * To limit damage, mark the inode as bad so that
-			 * subsequent lookup validations will fail.
-			 */
-			PARANOIA("%s/%s changed mode, %07o to %07o\n",
-				 DENTRY_PATH(dentry),
-				 inode->i_mode, fattr.f_mode);
-
-			fattr.f_mode = inode->i_mode; /* save mode */
-			make_bad_inode(inode);
-			inode->i_mode = fattr.f_mode; /* restore mode */
-			/*
-			 * No need to worry about unhashing the dentry: the
-			 * lookup validation will see that the inode is bad.
-			 * But we do want to invalidate the caches ...
-			 */
-			if (!S_ISDIR(inode->i_mode))
-				invalidate_remote_inode(inode);
-			else
-				smb_invalid_dir_cache(inode);
-			error = -EIO;
-		}
-	}
-	return error;
-}
-
-/*
- * This is called when we want to check whether the inode
- * has changed on the server.  If it has changed, we must
- * invalidate our local caches.
- */
-int
-smb_revalidate_inode(struct dentry *dentry)
-{
-	struct smb_sb_info *s = server_from_dentry(dentry);
-	struct inode *inode = dentry->d_inode;
-	int error = 0;
-
-	DEBUG1("smb_revalidate_inode\n");
-	lock_kernel();
-
-	/*
-	 * Check whether we've recently refreshed the inode.
-	 */
-	if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
-		VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
-			inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
-		goto out;
-	}
-
-	error = smb_refresh_inode(dentry);
-out:
-	unlock_kernel();
-	return error;
-}
-
-/*
- * This routine is called when i_nlink == 0 and i_count goes to 0.
- * All blocking cleanup operations need to go here to avoid races.
- */
-static void
-smb_evict_inode(struct inode *ino)
-{
-	DEBUG1("ino=%ld\n", ino->i_ino);
-	truncate_inode_pages(&ino->i_data, 0);
-	end_writeback(ino);
-	lock_kernel();
-	if (smb_close(ino))
-		PARANOIA("could not close inode %ld\n", ino->i_ino);
-	unlock_kernel();
-}
-
-static struct option opts[] = {
-	{ "version",	0, 'v' },
-	{ "win95",	SMB_MOUNT_WIN95, 1 },
-	{ "oldattr",	SMB_MOUNT_OLDATTR, 1 },
-	{ "dirattr",	SMB_MOUNT_DIRATTR, 1 },
-	{ "case",	SMB_MOUNT_CASE, 1 },
-	{ "uid",	0, 'u' },
-	{ "gid",	0, 'g' },
-	{ "file_mode",	0, 'f' },
-	{ "dir_mode",	0, 'd' },
-	{ "iocharset",	0, 'i' },
-	{ "codepage",	0, 'c' },
-	{ "ttl",	0, 't' },
-	{ NULL,		0, 0}
-};
-
-static int
-parse_options(struct smb_mount_data_kernel *mnt, char *options)
-{
-	int c;
-	unsigned long flags;
-	unsigned long value;
-	char *optarg;
-	char *optopt;
-
-	flags = 0;
-	while ( (c = smb_getopt("smbfs", &options, opts,
-				&optopt, &optarg, &flags, &value)) > 0) {
-
-		VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
-		switch (c) {
-		case 1:
-			/* got a "flag" option */
-			break;
-		case 'v':
-			if (value != SMB_MOUNT_VERSION) {
-			printk ("smbfs: Bad mount version %ld, expected %d\n",
-				value, SMB_MOUNT_VERSION);
-				return 0;
-			}
-			mnt->version = value;
-			break;
-		case 'u':
-			mnt->uid = value;
-			flags |= SMB_MOUNT_UID;
-			break;
-		case 'g':
-			mnt->gid = value;
-			flags |= SMB_MOUNT_GID;
-			break;
-		case 'f':
-			mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
-			flags |= SMB_MOUNT_FMODE;
-			break;
-		case 'd':
-			mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
-			flags |= SMB_MOUNT_DMODE;
-			break;
-		case 'i':
-			strlcpy(mnt->codepage.local_name, optarg, 
-				SMB_NLS_MAXNAMELEN);
-			break;
-		case 'c':
-			strlcpy(mnt->codepage.remote_name, optarg,
-				SMB_NLS_MAXNAMELEN);
-			break;
-		case 't':
-			mnt->ttl = value;
-			break;
-		default:
-			printk ("smbfs: Unrecognized mount option %s\n",
-				optopt);
-			return -1;
-		}
-	}
-	mnt->flags = flags;
-	return c;
-}
-
-/*
- * smb_show_options() is for displaying mount options in /proc/mounts.
- * It tries to avoid showing settings that were not changed from their
- * defaults.
- */
-static int
-smb_show_options(struct seq_file *s, struct vfsmount *m)
-{
-	struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
-	int i;
-
-	for (i = 0; opts[i].name != NULL; i++)
-		if (mnt->flags & opts[i].flag)
-			seq_printf(s, ",%s", opts[i].name);
-
-	if (mnt->flags & SMB_MOUNT_UID)
-		seq_printf(s, ",uid=%d", mnt->uid);
-	if (mnt->flags & SMB_MOUNT_GID)
-		seq_printf(s, ",gid=%d", mnt->gid);
-	if (mnt->mounted_uid != 0)
-		seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
-
-	/* 
-	 * Defaults for file_mode and dir_mode are unknown to us; they
-	 * depend on the current umask of the user doing the mount.
-	 */
-	if (mnt->flags & SMB_MOUNT_FMODE)
-		seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
-	if (mnt->flags & SMB_MOUNT_DMODE)
-		seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
-
-	if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
-		seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
-	if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
-		seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
-
-	if (mnt->ttl != SMB_TTL_DEFAULT)
-		seq_printf(s, ",ttl=%d", mnt->ttl);
-
-	return 0;
-}
-
-static void
-smb_unload_nls(struct smb_sb_info *server)
-{
-	unload_nls(server->remote_nls);
-	unload_nls(server->local_nls);
-}
-
-static void
-smb_put_super(struct super_block *sb)
-{
-	struct smb_sb_info *server = SMB_SB(sb);
-
-	lock_kernel();
-
-	smb_lock_server(server);
-	server->state = CONN_INVALID;
-	smbiod_unregister_server(server);
-
-	smb_close_socket(server);
-
-	if (server->conn_pid)
-		kill_pid(server->conn_pid, SIGTERM, 1);
-
-	bdi_destroy(&server->bdi);
-	kfree(server->ops);
-	smb_unload_nls(server);
-	sb->s_fs_info = NULL;
-	smb_unlock_server(server);
-	put_pid(server->conn_pid);
-	kfree(server);
-
-	unlock_kernel();
-}
-
-static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-	struct smb_sb_info *server;
-	struct smb_mount_data_kernel *mnt;
-	struct smb_mount_data *oldmnt;
-	struct inode *root_inode;
-	struct smb_fattr root;
-	int ver;
-	void *mem;
-	static int warn_count;
-
-	if (warn_count < 5) {
-		warn_count++;
-		printk(KERN_EMERG "smbfs is deprecated and will be removed"
-			" from the 2.6.27 kernel. Please migrate to cifs\n");
-	}
-
-	if (!raw_data)
-		goto out_no_data;
-
-	oldmnt = (struct smb_mount_data *) raw_data;
-	ver = oldmnt->version;
-	if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
-		goto out_wrong_data;
-
-	sb->s_flags |= MS_NODIRATIME;
-	sb->s_blocksize = 1024;	/* Eh...  Is this correct? */
-	sb->s_blocksize_bits = 10;
-	sb->s_magic = SMB_SUPER_MAGIC;
-	sb->s_op = &smb_sops;
-	sb->s_time_gran = 100;
-
-	server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
-	if (!server)
-		goto out_no_server;
-	sb->s_fs_info = server;
-	
-	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
-		goto out_bdi;
-
-	sb->s_bdi = &server->bdi;
-
-	server->super_block = sb;
-	server->mnt = NULL;
-	server->sock_file = NULL;
-	init_waitqueue_head(&server->conn_wq);
-	init_MUTEX(&server->sem);
-	INIT_LIST_HEAD(&server->entry);
-	INIT_LIST_HEAD(&server->xmitq);
-	INIT_LIST_HEAD(&server->recvq);
-	server->conn_error = 0;
-	server->conn_pid = NULL;
-	server->state = CONN_INVALID; /* no connection yet */
-	server->generation = 0;
-
-	/* Allocate the global temp buffer and some superblock helper structs */
-	/* FIXME: move these to the smb_sb_info struct */
-	VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
-		sizeof(struct smb_mount_data_kernel));
-	mem = kmalloc(sizeof(struct smb_ops) +
-		      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
-	if (!mem)
-		goto out_no_mem;
-
-	server->ops = mem;
-	smb_install_null_ops(server->ops);
-	server->mnt = mem + sizeof(struct smb_ops);
-
-	/* Setup NLS stuff */
-	server->remote_nls = NULL;
-	server->local_nls = NULL;
-
-	mnt = server->mnt;
-
-	memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
-	strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
-		SMB_NLS_MAXNAMELEN);
-	strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
-		SMB_NLS_MAXNAMELEN);
-
-	mnt->ttl = SMB_TTL_DEFAULT;
-	if (ver == SMB_MOUNT_OLDVERSION) {
-		mnt->version = oldmnt->version;
-
-		SET_UID(mnt->uid, oldmnt->uid);
-		SET_GID(mnt->gid, oldmnt->gid);
-
-		mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
-		mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-
-		mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
-			SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
-	} else {
-		mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-				S_IROTH | S_IXOTH | S_IFREG;
-		mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-				S_IROTH | S_IXOTH | S_IFDIR;
-		if (parse_options(mnt, raw_data))
-			goto out_bad_option;
-	}
-	mnt->mounted_uid = current_uid();
-	smb_setcodepage(server, &mnt->codepage);
-
-	/*
-	 * Display the enabled options
-	 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
-	 */
-	if (mnt->flags & SMB_MOUNT_OLDATTR)
-		printk("SMBFS: Using core getattr (Win 95 speedup)\n");
-	else if (mnt->flags & SMB_MOUNT_DIRATTR)
-		printk("SMBFS: Using dir ff getattr\n");
-
-	if (smbiod_register_server(server) < 0) {
-		printk(KERN_ERR "smbfs: failed to start smbiod\n");
-		goto out_no_smbiod;
-	}
-
-	/*
-	 * Keep the super block locked while we get the root inode.
-	 */
-	smb_init_root_dirent(server, &root, sb);
-	root_inode = smb_iget(sb, &root);
-	if (!root_inode)
-		goto out_no_root;
-
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_no_root;
-
-	smb_new_dentry(sb->s_root);
-
-	return 0;
-
-out_no_root:
-	iput(root_inode);
-out_no_smbiod:
-	smb_unload_nls(server);
-out_bad_option:
-	kfree(mem);
-out_no_mem:
-	bdi_destroy(&server->bdi);
-out_bdi:
-	if (!server->mnt)
-		printk(KERN_ERR "smb_fill_super: allocation failure\n");
-	sb->s_fs_info = NULL;
-	kfree(server);
-	goto out_fail;
-out_wrong_data:
-	printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
-	goto out_fail;
-out_no_data:
-	printk(KERN_ERR "smb_fill_super: missing data argument\n");
-out_fail:
-	return -EINVAL;
-out_no_server:
-	printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
-	return -ENOMEM;
-}
-
-static int
-smb_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int result;
-	
-	lock_kernel();
-
-	result = smb_proc_dskattr(dentry, buf);
-
-	unlock_kernel();
-
-	buf->f_type = SMB_SUPER_MAGIC;
-	buf->f_namelen = SMB_MAXPATHLEN;
-	return result;
-}
-
-int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
-{
-	int err = smb_revalidate_inode(dentry);
-	if (!err)
-		generic_fillattr(dentry->d_inode, stat);
-	return err;
-}
-
-int
-smb_notify_change(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
-	int error, changed, refresh = 0;
-	struct smb_fattr fattr;
-
-	lock_kernel();
-
-	error = smb_revalidate_inode(dentry);
-	if (error)
-		goto out;
-
-	if ((error = inode_change_ok(inode, attr)) < 0)
-		goto out;
-
-	error = -EPERM;
-	if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_SIZE) != 0) {
-		VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
-			DENTRY_PATH(dentry),
-			(long) inode->i_size, (long) attr->ia_size);
-
-		filemap_write_and_wait(inode->i_mapping);
-
-		error = smb_open(dentry, O_WRONLY);
-		if (error)
-			goto out;
-		error = server->ops->truncate(inode, attr->ia_size);
-		if (error)
-			goto out;
-		truncate_setsize(inode, attr->ia_size);
-		refresh = 1;
-	}
-
-	if (server->opt.capabilities & SMB_CAP_UNIX) {
-		/* For now we don't want to set the size with setattr_unix */
-		attr->ia_valid &= ~ATTR_SIZE;
-		/* FIXME: only call if we actually want to set something? */
-		error = smb_proc_setattr_unix(dentry, attr, 0, 0);
-		if (!error)
-			refresh = 1;
-
-		goto out;
-	}
-
-	/*
-	 * Initialize the fattr and check for changed fields.
-	 * Note: CTIME under SMB is creation time rather than
-	 * change time, so we don't attempt to change it.
-	 */
-	smb_get_inode_attr(inode, &fattr);
-
-	changed = 0;
-	if ((attr->ia_valid & ATTR_MTIME) != 0) {
-		fattr.f_mtime = attr->ia_mtime;
-		changed = 1;
-	}
-	if ((attr->ia_valid & ATTR_ATIME) != 0) {
-		fattr.f_atime = attr->ia_atime;
-		/* Earlier protocols don't have an access time */
-		if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
-			changed = 1;
-	}
-	if (changed) {
-		error = smb_proc_settime(dentry, &fattr);
-		if (error)
-			goto out;
-		refresh = 1;
-	}
-
-	/*
-	 * Check for mode changes ... we're extremely limited in
-	 * what can be set for SMB servers: just the read-only bit.
-	 */
-	if ((attr->ia_valid & ATTR_MODE) != 0) {
-		VERBOSE("%s/%s mode change, old=%x, new=%x\n",
-			DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
-		changed = 0;
-		if (attr->ia_mode & S_IWUSR) {
-			if (fattr.attr & aRONLY) {
-				fattr.attr &= ~aRONLY;
-				changed = 1;
-			}
-		} else {
-			if (!(fattr.attr & aRONLY)) {
-				fattr.attr |= aRONLY;
-				changed = 1;
-			}
-		}
-		if (changed) {
-			error = smb_proc_setattr(dentry, &fattr);
-			if (error)
-				goto out;
-			refresh = 1;
-		}
-	}
-	error = 0;
-
-out:
-	if (refresh)
-		smb_refresh_inode(dentry);
-	unlock_kernel();
-	return error;
-}
-
-static int smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
-}
-
-static struct file_system_type smb_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "smbfs",
-	.get_sb		= smb_get_sb,
-	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_BINARY_MOUNTDATA,
-};
-
-static int __init init_smb_fs(void)
-{
-	int err;
-	DEBUG1("registering ...\n");
-
-	err = init_inodecache();
-	if (err)
-		goto out_inode;
-	err = smb_init_request_cache();
-	if (err)
-		goto out_request;
-	err = register_filesystem(&smb_fs_type);
-	if (err)
-		goto out;
-	return 0;
-out:
-	smb_destroy_request_cache();
-out_request:
-	destroy_inodecache();
-out_inode:
-	return err;
-}
-
-static void __exit exit_smb_fs(void)
-{
-	DEBUG1("unregistering ...\n");
-	unregister_filesystem(&smb_fs_type);
-	smb_destroy_request_cache();
-	destroy_inodecache();
-}
-
-module_init(init_smb_fs)
-module_exit(exit_smb_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  ioctl.c
- *
- *  Copyright (C) 1995, 1996 by Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/time.h>
-#include <linux/mm.h>
-#include <linux/highuid.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-
-#include <asm/uaccess.h>
-
-#include "proto.h"
-
-long
-smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
-	struct smb_conn_opt opt;
-	int result = -EINVAL;
-
-	lock_kernel();
-	switch (cmd) {
-		uid16_t uid16;
-		uid_t uid32;
-	case SMB_IOC_GETMOUNTUID:
-		SET_UID(uid16, server->mnt->mounted_uid);
-		result = put_user(uid16, (uid16_t __user *) arg);
-		break;
-	case SMB_IOC_GETMOUNTUID32:
-		SET_UID(uid32, server->mnt->mounted_uid);
-		result = put_user(uid32, (uid_t __user *) arg);
-		break;
-
-	case SMB_IOC_NEWCONN:
-		/* arg is smb_conn_opt, or NULL if no connection was made */
-		if (!arg) {
-			result = 0;
-			smb_lock_server(server);
-			server->state = CONN_RETRIED;
-			printk(KERN_ERR "Connection attempt failed!  [%d]\n",
-			       server->conn_error);
-			smbiod_flush(server);
-			smb_unlock_server(server);
-			break;
-		}
-
-		result = -EFAULT;
-		if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
-			result = smb_newconn(server, &opt);
-		break;
-	default:
-		break;
-	}
-	unlock_kernel();
-
-	return result;
-}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
-/*
- *  proc.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/dcache.h>
-#include <linux/nls.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <net/sock.h>
-
-#include <asm/string.h>
-#include <asm/div64.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-
-
-/* Features. Undefine if they cause problems, this should perhaps be a
-   config option. */
-#define SMBFS_POSIX_UNLINK 1
-
-/* Allow smb_retry to be interrupted. */
-#define SMB_RETRY_INTR
-
-#define SMB_VWV(packet)  ((packet) + SMB_HEADER_LEN)
-#define SMB_CMD(packet)  (*(packet+8))
-#define SMB_WCT(packet)  (*(packet+SMB_HEADER_LEN - 1))
-
-#define SMB_DIRINFO_SIZE 43
-#define SMB_STATUS_SIZE  21
-
-#define SMB_ST_BLKSIZE	(PAGE_SIZE)
-#define SMB_ST_BLKSHIFT	(PAGE_SHIFT)
-
-static struct smb_ops smb_ops_core;
-static struct smb_ops smb_ops_os2;
-static struct smb_ops smb_ops_win95;
-static struct smb_ops smb_ops_winNT;
-static struct smb_ops smb_ops_unix;
-static struct smb_ops smb_ops_null;
-
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *fattr);
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-		    struct smb_fattr *fattr);
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-		      u16 attr);
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-		     struct inode *inode, struct smb_fattr *fattr);
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server);
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src);
-
-
-static void
-str_upper(char *name, int len)
-{
-	while (len--)
-	{
-		if (*name >= 'a' && *name <= 'z')
-			*name -= ('a' - 'A');
-		name++;
-	}
-}
-
-#if 0
-static void
-str_lower(char *name, int len)
-{
-	while (len--)
-	{
-		if (*name >= 'A' && *name <= 'Z')
-			*name += ('a' - 'A');
-		name++;
-	}
-}
-#endif
-
-/* reverse a string inline. This is used by the dircache walking routines */
-static void reverse_string(char *buf, int len)
-{
-	char c;
-	char *end = buf+len-1;
-
-	while(buf < end) {
-		c = *buf;
-		*(buf++) = *end;
-		*(end--) = c;
-	}
-}
-
-/* no conversion, just a wrapper for memcpy. */
-static int convert_memcpy(unsigned char *output, int olen,
-			  const unsigned char *input, int ilen,
-			  struct nls_table *nls_from,
-			  struct nls_table *nls_to)
-{
-	if (olen < ilen)
-		return -ENAMETOOLONG;
-	memcpy(output, input, ilen);
-	return ilen;
-}
-
-static inline int write_char(unsigned char ch, char *output, int olen)
-{
-	if (olen < 4)
-		return -ENAMETOOLONG;
-	sprintf(output, ":x%02x", ch);
-	return 4;
-}
-
-static inline int write_unichar(wchar_t ch, char *output, int olen)
-{
-	if (olen < 5)
-		return -ENAMETOOLONG;
-	sprintf(output, ":%04x", ch);
-	return 5;
-}
-
-/* convert from one "codepage" to another (possibly being utf8). */
-static int convert_cp(unsigned char *output, int olen,
-		      const unsigned char *input, int ilen,
-		      struct nls_table *nls_from,
-		      struct nls_table *nls_to)
-{
-	int len = 0;
-	int n;
-	wchar_t ch;
-
-	while (ilen > 0) {
-		/* convert by changing to unicode and back to the new cp */
-		n = nls_from->char2uni(input, ilen, &ch);
-		if (n == -EINVAL) {
-			ilen--;
-			n = write_char(*input++, output, olen);
-			if (n < 0)
-				goto fail;
-			output += n;
-			olen -= n;
-			len += n;
-			continue;
-		} else if (n < 0)
-			goto fail;
-		input += n;
-		ilen -= n;
-
-		n = nls_to->uni2char(ch, output, olen);
-		if (n == -EINVAL)
-			n = write_unichar(ch, output, olen);
-		if (n < 0)
-			goto fail;
-		output += n;
-		olen -= n;
-
-		len += n;
-	}
-	return len;
-fail:
-	return n;
-}
-
-/* ----------------------------------------------------------- */
-
-/*
- * nls_unicode
- *
- * This encodes/decodes little endian unicode format
- */
-
-static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
-{
-	if (boundlen < 2)
-		return -EINVAL;
-	*out++ = uni & 0xff;
-	*out++ = uni >> 8;
-	return 2;
-}
-
-static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
-{
-	if (boundlen < 2)
-		return -EINVAL;
-	*uni = (rawstring[1] << 8) | rawstring[0];
-	return 2;
-}
-
-static struct nls_table unicode_table = {
-	.charset	= "unicode",
-	.uni2char	= uni2char,
-	.char2uni	= char2uni,
-};
-
-/* ----------------------------------------------------------- */
-
-static int setcodepage(struct nls_table **p, char *name)
-{
-	struct nls_table *nls;
-
-	if (!name || !*name) {
-		nls = NULL;
-	} else if ( (nls = load_nls(name)) == NULL) {
-		printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
-		return -EINVAL;
-	}
-
-	/* if already set, unload the previous one. */
-	if (*p && *p != &unicode_table)
-		unload_nls(*p);
-	*p = nls;
-
-	return 0;
-}
-
-/* Handles all changes to codepage settings. */
-int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
-{
-	int n = 0;
-
-	smb_lock_server(server);
-
-	/* Don't load any nls_* at all, if no remote is requested */
-	if (!*cp->remote_name)
-		goto out;
-
-	/* local */
-	n = setcodepage(&server->local_nls, cp->local_name);
-	if (n != 0)
-		goto out;
-
-	/* remote */
-	if (!strcmp(cp->remote_name, "unicode")) {
-		server->remote_nls = &unicode_table;
-	} else {
-		n = setcodepage(&server->remote_nls, cp->remote_name);
-		if (n != 0)
-			setcodepage(&server->local_nls, NULL);
-	}
-
-out:
-	if (server->local_nls != NULL && server->remote_nls != NULL)
-		server->ops->convert = convert_cp;
-	else
-		server->ops->convert = convert_memcpy;
-
-	smb_unlock_server(server);
-	return n;
-}
-
-
-/*****************************************************************************/
-/*                                                                           */
-/*  Encoding/Decoding section                                                */
-/*                                                                           */
-/*****************************************************************************/
-
-static __u8 *
-smb_encode_smb_length(__u8 * p, __u32 len)
-{
-	*p = 0;
-	*(p+1) = 0;
-	*(p+2) = (len & 0xFF00) >> 8;
-	*(p+3) = (len & 0xFF);
-	if (len > 0xFFFF)
-	{
-		*(p+1) = 1;
-	}
-	return p + 4;
-}
-
-/*
- * smb_build_path: build the path to entry and name storing it in buf.
- * The path returned will have the trailing '\0'.
- */
-static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
-			  int maxlen,
-			  struct dentry *entry, struct qstr *name)
-{
-	unsigned char *path = buf;
-	int len;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
-
-	if (maxlen < (2<<unicode))
-		return -ENAMETOOLONG;
-
-	if (maxlen > SMB_MAXPATHLEN + 1)
-		maxlen = SMB_MAXPATHLEN + 1;
-
-	if (entry == NULL)
-		goto test_name_and_out;
-
-	/*
-	 * If IS_ROOT, we have to do no walking at all.
-	 */
-	if (IS_ROOT(entry) && !name) {
-		*path++ = '\\';
-		if (unicode) *path++ = '\0';
-		*path++ = '\0';
-		if (unicode) *path++ = '\0';
-		return path-buf;
-	}
-
-	/*
-	 * Build the path string walking the tree backward from end to ROOT
-	 * and store it in reversed order [see reverse_string()]
-	 */
-	dget(entry);
-	spin_lock(&entry->d_lock);
-	while (!IS_ROOT(entry)) {
-		struct dentry *parent;
-
-		if (maxlen < (3<<unicode)) {
-			spin_unlock(&entry->d_lock);
-			dput(entry);
-			return -ENAMETOOLONG;
-		}
-
-		len = server->ops->convert(path, maxlen-2, 
-				      entry->d_name.name, entry->d_name.len,
-				      server->local_nls, server->remote_nls);
-		if (len < 0) {
-			spin_unlock(&entry->d_lock);
-			dput(entry);
-			return len;
-		}
-		reverse_string(path, len);
-		path += len;
-		if (unicode) {
-			/* Note: reverse order */
-			*path++ = '\0';
-			maxlen--;
-		}
-		*path++ = '\\';
-		maxlen -= len+1;
-
-		parent = entry->d_parent;
-		dget(parent);
-		spin_unlock(&entry->d_lock);
-		dput(entry);
-		entry = parent;
-		spin_lock(&entry->d_lock);
-	}
-	spin_unlock(&entry->d_lock);
-	dput(entry);
-	reverse_string(buf, path-buf);
-
-	/* maxlen has space for at least one char */
-test_name_and_out:
-	if (name) {
-		if (maxlen < (3<<unicode))
-			return -ENAMETOOLONG;
-		*path++ = '\\';
-		if (unicode) {
-			*path++ = '\0';
-			maxlen--;
-		}
-		len = server->ops->convert(path, maxlen-2, 
-				      name->name, name->len,
-				      server->local_nls, server->remote_nls);
-		if (len < 0)
-			return len;
-		path += len;
-		maxlen -= len+1;
-	}
-	/* maxlen has space for at least one char */
-	*path++ = '\0';
-	if (unicode) *path++ = '\0';
-	return path-buf;
-}
-
-static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
-			   struct dentry *dir, struct qstr *name)
-{
-	int result;
-
-	result = smb_build_path(server, buf, maxlen, dir, name);
-	if (result < 0)
-		goto out;
-	if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
-		str_upper(buf, result);
-out:
-	return result;
-}
-
-/* encode_path for non-trans2 request SMBs */
-static int smb_simple_encode_path(struct smb_request *req, char **p,
-				  struct dentry * entry, struct qstr * name)
-{
-	struct smb_sb_info *server = req->rq_server;
-	char *s = *p;
-	int res;
-	int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-
-	if (!maxlen)
-		return -ENAMETOOLONG;
-	*s++ = 4;	/* ASCII data format */
-
-	/*
-	 * SMB Unicode strings must be 16bit aligned relative the start of the
-	 * packet. If they are not they must be padded with 0.
-	 */
-	if (unicode) {
-		int align = s - (char *)req->rq_buffer;
-		if (!(align & 1)) {
-			*s++ = '\0';
-			maxlen--;
-		}
-	}
-
-	res = smb_encode_path(server, s, maxlen-1, entry, name);
-	if (res < 0)
-		return res;
-	*p = s + res;
-	return 0;
-}
-
-/* The following are taken directly from msdos-fs */
-
-/* Linear day numbers of the respective 1sts in non-leap years. */
-
-static int day_n[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
-		  /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
-
-
-static time_t
-utc2local(struct smb_sb_info *server, time_t time)
-{
-	return time - server->opt.serverzone*60;
-}
-
-static time_t
-local2utc(struct smb_sb_info *server, time_t time)
-{
-	return time + server->opt.serverzone*60;
-}
-
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-
-static time_t
-date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
-{
-	int month, year;
-	time_t secs;
-
-	/* first subtract and mask after that... Otherwise, if
-	   date == 0, bad things happen */
-	month = ((date >> 5) - 1) & 15;
-	year = date >> 9;
-	secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
-	    ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
-						   month < 2 ? 1 : 0) + 3653);
-	/* days since 1.1.70 plus 80's leap day */
-	return local2utc(server, secs);
-}
-
-
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-
-static void
-date_unix2dos(struct smb_sb_info *server,
-	      int unix_date, __u16 *date, __u16 *time)
-{
-	int day, year, nl_day, month;
-
-	unix_date = utc2local(server, unix_date);
-	if (unix_date < 315532800)
-		unix_date = 315532800;
-
-	*time = (unix_date % 60) / 2 +
-		(((unix_date / 60) % 60) << 5) +
-		(((unix_date / 3600) % 24) << 11);
-
-	day = unix_date / 86400 - 3652;
-	year = day / 365;
-	if ((year + 3) / 4 + 365 * year > day)
-		year--;
-	day -= (year + 3) / 4 + 365 * year;
-	if (day == 59 && !(year & 3)) {
-		nl_day = day;
-		month = 2;
-	} else {
-		nl_day = (year & 3) || day <= 59 ? day : day - 1;
-		for (month = 1; month < 12; month++)
-			if (day_n[month] > nl_day)
-				break;
-	}
-	*date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
-}
-
-/* The following are taken from fs/ntfs/util.c */
-
-#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
-
-/*
- * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
- * into Unix UTC (based 1970-01-01, in seconds).
- */
-static struct timespec
-smb_ntutc2unixutc(u64 ntutc)
-{
-	struct timespec ts;
-	/* FIXME: what about the timezone difference? */
-	/* Subtract the NTFS time offset, then convert to 1s intervals. */
-	u64 t = ntutc - NTFS_TIME_OFFSET;
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t; 
-	return ts;
-}
-
-/* Convert the Unix UTC into NT time */
-static u64
-smb_unixutc2ntutc(struct timespec ts)
-{
-	/* Note: timezone conversion is probably wrong. */
-	/* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
-	return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
-}
-
-#define MAX_FILE_MODE	6
-static mode_t file_mode[] = {
-	S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
-};
-
-static int smb_filetype_to_mode(u32 filetype)
-{
-	if (filetype > MAX_FILE_MODE) {
-		PARANOIA("Filetype out of range: %d\n", filetype);
-		return S_IFREG;
-	}
-	return file_mode[filetype];
-}
-
-static u32 smb_filetype_from_mode(int mode)
-{
-	if (S_ISREG(mode))
-		return UNIX_TYPE_FILE;
-	if (S_ISDIR(mode))
-		return UNIX_TYPE_DIR;
-	if (S_ISLNK(mode))
-		return UNIX_TYPE_SYMLINK;
-	if (S_ISCHR(mode))
-		return UNIX_TYPE_CHARDEV;
-	if (S_ISBLK(mode))
-		return UNIX_TYPE_BLKDEV;
-	if (S_ISFIFO(mode))
-		return UNIX_TYPE_FIFO;
-	if (S_ISSOCK(mode))
-		return UNIX_TYPE_SOCKET;
-	return UNIX_TYPE_UNKNOWN;
-}
-
-
-/*****************************************************************************/
-/*                                                                           */
-/*  Support section.                                                         */
-/*                                                                           */
-/*****************************************************************************/
-
-__u32
-smb_len(__u8 * p)
-{
-	return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
-}
-
-static __u16
-smb_bcc(__u8 * packet)
-{
-	int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
-	return WVAL(packet, pos);
-}
-
-/* smb_valid_packet: We check if packet fulfills the basic
-   requirements of a smb packet */
-
-static int
-smb_valid_packet(__u8 * packet)
-{
-	return (packet[4] == 0xff
-		&& packet[5] == 'S'
-		&& packet[6] == 'M'
-		&& packet[7] == 'B'
-		&& (smb_len(packet) + 4 == SMB_HEADER_LEN
-		    + SMB_WCT(packet) * 2 + smb_bcc(packet)));
-}
-
-/* smb_verify: We check if we got the answer we expected, and if we
-   got enough data. If bcc == -1, we don't care. */
-
-static int
-smb_verify(__u8 * packet, int command, int wct, int bcc)
-{
-	if (SMB_CMD(packet) != command)
-		goto bad_command;
-	if (SMB_WCT(packet) < wct)
-		goto bad_wct;
-	if (bcc != -1 && smb_bcc(packet) < bcc)
-		goto bad_bcc;
-	return 0;
-
-bad_command:
-	printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
-	       command, SMB_CMD(packet));
-	goto fail;
-bad_wct:
-	printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
-	       command, wct, SMB_WCT(packet));
-	goto fail;
-bad_bcc:
-	printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
-	       command, bcc, smb_bcc(packet));
-fail:
-	return -EIO;
-}
-
-/*
- * Returns the maximum read or write size for the "payload". Making all of the
- * packet fit within the negotiated max_xmit size.
- *
- * N.B. Since this value is usually computed before locking the server,
- * the server's packet size must never be decreased!
- */
-static inline int
-smb_get_xmitsize(struct smb_sb_info *server, int overhead)
-{
-	return server->opt.max_xmit - overhead;
-}
-
-/*
- * Calculate the maximum read size
- */
-int
-smb_get_rsize(struct smb_sb_info *server)
-{
-	/* readX has 12 parameters, read has 5 */
-	int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
-	int size = smb_get_xmitsize(server, overhead);
-
-	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-
-	return size;
-}
-
-/*
- * Calculate the maximum write size
- */
-int
-smb_get_wsize(struct smb_sb_info *server)
-{
-	/* writeX has 14 parameters, write has 5 */
-	int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
-	int size = smb_get_xmitsize(server, overhead);
-
-	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-
-	return size;
-}
-
-/*
- * Convert SMB error codes to -E... errno values.
- */
-int
-smb_errno(struct smb_request *req)
-{
-	int errcls = req->rq_rcls;
-	int error  = req->rq_err;
-	char *class = "Unknown";
-
-	VERBOSE("errcls %d  code %d  from command 0x%x\n",
-		errcls, error, SMB_CMD(req->rq_header));
-
-	if (errcls == ERRDOS) {
-		switch (error) {
-		case ERRbadfunc:
-			return -EINVAL;
-		case ERRbadfile:
-		case ERRbadpath:
-			return -ENOENT;
-		case ERRnofids:
-			return -EMFILE;
-		case ERRnoaccess:
-			return -EACCES;
-		case ERRbadfid:
-			return -EBADF;
-		case ERRbadmcb:
-			return -EREMOTEIO;
-		case ERRnomem:
-			return -ENOMEM;
-		case ERRbadmem:
-			return -EFAULT;
-		case ERRbadenv:
-		case ERRbadformat:
-			return -EREMOTEIO;
-		case ERRbadaccess:
-			return -EACCES;
-		case ERRbaddata:
-			return -E2BIG;
-		case ERRbaddrive:
-			return -ENXIO;
-		case ERRremcd:
-			return -EREMOTEIO;
-		case ERRdiffdevice:
-			return -EXDEV;
-		case ERRnofiles:
-			return -ENOENT;
-		case ERRbadshare:
-			return -ETXTBSY;
-		case ERRlock:
-			return -EDEADLK;
-		case ERRfilexists:
-			return -EEXIST;
-		case ERROR_INVALID_PARAMETER:
-			return -EINVAL;
-		case ERROR_DISK_FULL:
-			return -ENOSPC;
-		case ERROR_INVALID_NAME:
-			return -ENOENT;
-		case ERROR_DIR_NOT_EMPTY:
-			return -ENOTEMPTY;
-		case ERROR_NOT_LOCKED:
-                       return -ENOLCK;
-		case ERROR_ALREADY_EXISTS:
-			return -EEXIST;
-		default:
-			class = "ERRDOS";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRSRV) {
-		switch (error) {
-		/* N.B. This is wrong ... EIO ? */
-		case ERRerror:
-			return -ENFILE;
-		case ERRbadpw:
-			return -EINVAL;
-		case ERRbadtype:
-		case ERRtimeout:
-			return -EIO;
-		case ERRaccess:
-			return -EACCES;
-		/*
-		 * This is a fatal error, as it means the "tree ID"
-		 * for this connection is no longer valid. We map
-		 * to a special error code and get a new connection.
-		 */
-		case ERRinvnid:
-			return -EBADSLT;
-		default:
-			class = "ERRSRV";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRHRD) {
-		switch (error) {
-		case ERRnowrite:
-			return -EROFS;
-		case ERRbadunit:
-			return -ENODEV;
-		case ERRnotready:
-			return -EUCLEAN;
-		case ERRbadcmd:
-		case ERRdata:
-			return -EIO;
-		case ERRbadreq:
-			return -ERANGE;
-		case ERRbadshare:
-			return -ETXTBSY;
-		case ERRlock:
-			return -EDEADLK;
-		case ERRdiskfull:
-			return -ENOSPC;
-		default:
-			class = "ERRHRD";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRCMD) {
-		class = "ERRCMD";
-	} else if (errcls == SUCCESS) {
-		return 0;	/* This is the only valid 0 return */
-	}
-
-err_unknown:
-	printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
-	       class, error, SMB_CMD(req->rq_header));
-	return -EIO;
-}
-
-/* smb_request_ok: We expect the server to be locked. Then we do the
-   request and check the answer completely. When smb_request_ok
-   returns 0, you can be quite sure that everything went well. When
-   the answer is <=0, the returned number is a valid unix errno. */
-
-static int
-smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
-{
-	int result;
-
-	req->rq_resp_wct = wct;
-	req->rq_resp_bcc = bcc;
-
-	result = smb_add_request(req);
-	if (result != 0) {
-		DEBUG1("smb_request failed\n");
-		goto out;
-	}
-
-	if (smb_valid_packet(req->rq_header) != 0) {
-		PARANOIA("invalid packet!\n");
-		goto out;
-	}
-
-	result = smb_verify(req->rq_header, command, wct, bcc);
-
-out:
-	return result;
-}
-
-/*
- * This implements the NEWCONN ioctl. It installs the server pid,
- * sets server->state to CONN_VALID, and wakes up the waiting process.
- */
-int
-smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
-{
-	struct file *filp;
-	struct sock *sk;
-	int error;
-
-	VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
-
-	smb_lock_server(server);
-
-	/*
-	 * Make sure we don't already have a valid connection ...
-	 */
-	error = -EINVAL;
-	if (server->state == CONN_VALID)
-		goto out;
-
-	error = -EACCES;
-	if (current_uid() != server->mnt->mounted_uid &&
-	    !capable(CAP_SYS_ADMIN))
-		goto out;
-
-	error = -EBADF;
-	filp = fget(opt->fd);
-	if (!filp)
-		goto out;
-	if (!smb_valid_socket(filp->f_path.dentry->d_inode))
-		goto out_putf;
-
-	server->sock_file = filp;
-	server->conn_pid = get_pid(task_pid(current));
-	server->opt = *opt;
-	server->generation += 1;
-	server->state = CONN_VALID;
-	error = 0;
-
-	if (server->conn_error) {
-		/*
-		 * conn_error is the returncode we originally decided to
-		 * drop the old connection on. This message should be positive
-		 * and not make people ask questions on why smbfs is printing
-		 * error messages ...
-		 */
-		printk(KERN_INFO "SMB connection re-established (%d)\n",
-		       server->conn_error);
-		server->conn_error = 0;
-	}
-
-	/*
-	 * Store the server in sock user_data (Only used by sunrpc)
-	 */
-	sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
-	sk->sk_user_data = server;
-
-	/* chain into the data_ready callback */
-	server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
-
-	/* check if we have an old smbmount that uses seconds for the 
-	   serverzone */
-	if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
-		server->opt.serverzone /= 60;
-
-	/* now that we have an established connection we can detect the server
-	   type and enable bug workarounds */
-	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
-		install_ops(server->ops, &smb_ops_core);
-	else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
-		install_ops(server->ops, &smb_ops_os2);
-	else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
-		 (server->opt.max_xmit < 0x1000) &&
-		 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
-		/* FIXME: can we kill the WIN95 flag now? */
-		server->mnt->flags |= SMB_MOUNT_WIN95;
-		VERBOSE("detected WIN95 server\n");
-		install_ops(server->ops, &smb_ops_win95);
-	} else {
-		/*
-		 * Samba has max_xmit 65535
-		 * NT4spX has max_xmit 4536 (or something like that)
-		 * win2k has ...
-		 */
-		VERBOSE("detected NT1 (Samba, NT4/5) server\n");
-		install_ops(server->ops, &smb_ops_winNT);
-	}
-
-	/* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
-	if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
-		server->ops->getattr = smb_proc_getattr_core;
-	} else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
-		server->ops->getattr = smb_proc_getattr_ff;
-	}
-
-	/* Decode server capabilities */
-	if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
-		/* Should be ok to set this now, as no one can access the
-		   mount until the connection has been established. */
-		SB_of(server)->s_maxbytes = ~0ULL >> 1;
-		VERBOSE("LFS enabled\n");
-	}
-	if (server->opt.capabilities & SMB_CAP_UNICODE) {
-		server->mnt->flags |= SMB_MOUNT_UNICODE;
-		VERBOSE("Unicode enabled\n");
-	} else {
-		server->mnt->flags &= ~SMB_MOUNT_UNICODE;
-	}
-#if 0
-	/* flags we may test for other patches ... */
-	if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
-		VERBOSE("Large reads enabled\n");
-	}
-	if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
-		VERBOSE("Large writes enabled\n");
-	}
-#endif
-	if (server->opt.capabilities & SMB_CAP_UNIX) {
-		struct inode *inode;
-		VERBOSE("Using UNIX CIFS extensions\n");
-		install_ops(server->ops, &smb_ops_unix);
-		inode = SB_of(server)->s_root->d_inode;
-		if (inode)
-			inode->i_op = &smb_dir_inode_operations_unix;
-	}
-
-	VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
-		server->opt.protocol, server->opt.max_xmit,
-		pid_nr(server->conn_pid), server->opt.capabilities);
-
-	/* FIXME: this really should be done by smbmount. */
-	if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
-		server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
-	}
-
-	smb_unlock_server(server);
-	smbiod_wake_up();
-	if (server->opt.capabilities & SMB_CAP_UNIX)
-		smb_proc_query_cifsunix(server);
-
-	server->conn_complete++;
-	wake_up_interruptible_all(&server->conn_wq);
-	return error;
-
-out:
-	smb_unlock_server(server);
-	smbiod_wake_up();
-	return error;
-
-out_putf:
-	fput(filp);
-	goto out;
-}
-
-/* smb_setup_header: We completely set up the packet. You only have to
-   insert the command-specific fields */
-
-__u8 *
-smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
-{
-	__u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
-	__u8 *p = req->rq_header;
-	struct smb_sb_info *server = req->rq_server;
-
-	p = smb_encode_smb_length(p, xmit_len - 4);
-
-	*p++ = 0xff;
-	*p++ = 'S';
-	*p++ = 'M';
-	*p++ = 'B';
-	*p++ = command;
-
-	memset(p, '\0', 19);
-	p += 19;
-	p += 8;
-
-	if (server->opt.protocol > SMB_PROTOCOL_CORE) {
-		int flags = SMB_FLAGS_CASELESS_PATHNAMES;
-		int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
-			SMB_FLAGS2_EXTENDED_ATTRIBUTES;	/* EA? not really ... */
-
-		*(req->rq_header + smb_flg) = flags;
-		if (server->mnt->flags & SMB_MOUNT_UNICODE)
-			flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
-		WSET(req->rq_header, smb_flg2, flags2);
-	}
-	*p++ = wct;		/* wct */
-	p += 2 * wct;
-	WSET(p, 0, bcc);
-
-	/* Include the header in the data to send */
-	req->rq_iovlen = 1;
-	req->rq_iov[0].iov_base = req->rq_header;
-	req->rq_iov[0].iov_len  = xmit_len - bcc;
-
-	return req->rq_buffer;
-}
-
-static void
-smb_setup_bcc(struct smb_request *req, __u8 *p)
-{
-	u16 bcc = p - req->rq_buffer;
-	u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
-
-	WSET(pbcc, 0, bcc);
-
-	smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + 
-			      2*SMB_WCT(req->rq_header) - 2 + bcc);
-
-	/* Include the "bytes" in the data to send */
-	req->rq_iovlen = 2;
-	req->rq_iov[1].iov_base = req->rq_buffer;
-	req->rq_iov[1].iov_len  = bcc;
-}
-
-static int
-smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
-	      __u16 mode, off_t offset)
-{
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBlseek, 4, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	WSET(req->rq_header, smb_vwv1, mode);
-	DSET(req->rq_header, smb_vwv2, offset);
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBlseek, 2, 0);
-	if (result < 0) {
-		result = 0;
-		goto out_free;
-	}
-
-	result = DVAL(req->rq_header, smb_vwv0);
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
-{
-	struct inode *ino = dentry->d_inode;
-	struct smb_inode_info *ei = SMB_I(ino);
-	int mode, read_write = 0x42, read_only = 0x40;
-	int res;
-	char *p;
-	struct smb_request *req;
-
-	/*
-	 * Attempt to open r/w, unless there are no write privileges.
-	 */
-	mode = read_write;
-	if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
-		mode = read_only;
-#if 0
-	/* FIXME: why is this code not in? below we fix it so that a caller
-	   wanting RO doesn't get RW. smb_revalidate_inode does some 
-	   optimization based on access mode. tail -f needs it to be correct.
-
-	   We must open rw since we don't do the open if called a second time
-	   with different 'wish'. Is that not supported by smb servers? */
-	if (!(wish & (O_WRONLY | O_RDWR)))
-		mode = read_only;
-#endif
-
-	res = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-      retry:
-	p = smb_setup_header(req, SMBopen, 2, 0);
-	WSET(req->rq_header, smb_vwv0, mode);
-	WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
-	res = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (res < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	res = smb_request_ok(req, SMBopen, 7, 0);
-	if (res != 0) {
-		if (mode == read_write &&
-		    (res == -EACCES || res == -ETXTBSY || res == -EROFS))
-		{
-			VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
-				DENTRY_PATH(dentry), res);
-			mode = read_only;
-			req->rq_flags = 0;
-			goto retry;
-		}
-		goto out_free;
-	}
-	/* We should now have data in vwv[0..6]. */
-
-	ei->fileid = WVAL(req->rq_header, smb_vwv0);
-	ei->attr   = WVAL(req->rq_header, smb_vwv1);
-	/* smb_vwv2 has mtime */
-	/* smb_vwv4 has size  */
-	ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
-	ei->open = server->generation;
-
-out_free:
-	smb_rput(req);
-out:
-	return res;
-}
-
-/*
- * Make sure the file is open, and check that the access
- * is compatible with the desired access.
- */
-int
-smb_open(struct dentry *dentry, int wish)
-{
-	struct inode *inode = dentry->d_inode;
-	int result;
-	__u16 access;
-
-	result = -ENOENT;
-	if (!inode) {
-		printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
-		       DENTRY_PATH(dentry));
-		goto out;
-	}
-
-	if (!smb_is_open(inode)) {
-		struct smb_sb_info *server = server_from_inode(inode);
-		result = 0;
-		if (!smb_is_open(inode))
-			result = smb_proc_open(server, dentry, wish);
-		if (result)
-			goto out;
-		/*
-		 * A successful open means the path is still valid ...
-		 */
-		smb_renew_times(dentry);
-	}
-
-	/*
-	 * Check whether the access is compatible with the desired mode.
-	 */
-	result = 0;
-	access = SMB_I(inode)->access;
-	if (access != wish && access != SMB_O_RDWR) {
-		PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
-			 DENTRY_PATH(dentry), access, wish);
-		result = -EACCES;
-	}
-out:
-	return result;
-}
-
-static int 
-smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
-{
-	struct smb_request *req;
-	int result = -ENOMEM;
-
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBclose, 3, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBclose, 0, 0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Win NT 4.0 has an apparent bug in that it fails to update the
- * modify time when writing to a file. As a workaround, we update
- * both modify and access time locally, and post the times to the
- * server when closing the file.
- */
-static int 
-smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
-{
-	struct smb_inode_info *ei = SMB_I(ino);
-	int result = 0;
-	if (smb_is_open(ino))
-	{
-		/*
-		 * We clear the open flag in advance, in case another
- 		 * process observes the value while we block below.
-		 */
-		ei->open = 0;
-
-		/*
-		 * Kludge alert: SMB timestamps are accurate only to
-		 * two seconds ... round the times to avoid needless
-		 * cache invalidations!
-		 */
-		if (ino->i_mtime.tv_sec & 1) { 
-			ino->i_mtime.tv_sec--;
-			ino->i_mtime.tv_nsec = 0; 
-		}
-		if (ino->i_atime.tv_sec & 1) {
-			ino->i_atime.tv_sec--;
-			ino->i_atime.tv_nsec = 0;
-		}
-		/*
-		 * If the file is open with write permissions,
-		 * update the time stamps to sync mtime and atime.
-		 */
-		if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
-		    (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
-		    !(ei->access == SMB_O_RDONLY))
-		{
-			struct smb_fattr fattr;
-			smb_get_inode_attr(ino, &fattr);
-			smb_proc_setattr_ext(server, ino, &fattr);
-		}
-
-		result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
-		/*
-		 * Force a revalidation after closing ... some servers
-		 * don't post the size until the file has been closed.
-		 */
-		if (server->opt.protocol < SMB_PROTOCOL_NT1)
-			ei->oldmtime = 0;
-		ei->closed = jiffies;
-	}
-	return result;
-}
-
-int
-smb_close(struct inode *ino)
-{
-	int result = 0;
-
-	if (smb_is_open(ino)) {
-		struct smb_sb_info *server = server_from_inode(ino);
-		result = smb_proc_close_inode(server, ino);
-	}
-	return result;
-}
-
-/*
- * This is used to close a file following a failed instantiate.
- * Since we don't have an inode, we can't use any of the above.
- */
-int
-smb_close_fileid(struct dentry *dentry, __u16 fileid)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int result;
-
-	result = smb_proc_close(server, fileid, get_seconds());
-	return result;
-}
-
-/* In smb_proc_read and smb_proc_write we do not retry, because the
-   file-id would not be valid after a reconnection. */
-
-static void
-smb_proc_read_data(struct smb_request *req)
-{
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = 3;
-
-	req->rq_iov[1].iov_base = req->rq_page;
-	req->rq_iov[1].iov_len  = req->rq_rsize;
-	req->rq_iovlen = 2;
-
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-
-static int
-smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	__u16 returned_count, data_len;
-	unsigned char *buf;
-	int result;
-	struct smb_request *req;
-	u8 rbuf[4];
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBread, 5, 0);
-	buf = req->rq_header;
-	WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
-	WSET(buf, smb_vwv1, count);
-	DSET(buf, smb_vwv2, offset);
-	WSET(buf, smb_vwv4, 0);
-
-	req->rq_page = data;
-	req->rq_rsize = count;
-	req->rq_callback = smb_proc_read_data;
-	req->rq_buffer = rbuf;
-	req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
-
-	result = smb_request_ok(req, SMBread, 5, -1);
-	if (result < 0)
-		goto out_free;
-	returned_count = WVAL(req->rq_header, smb_vwv0);
-
-	data_len = WVAL(rbuf, 1);
-
-	if (returned_count != data_len) {
-		printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
-		printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
-		       returned_count, data_len);
-	}
-	result = data_len;
-
-out_free:
-	smb_rput(req);
-out:
-	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, result);
-	return result;
-}
-
-static int
-smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	u16 fileid = SMB_I(inode)->fileid;
-	u8 buf[4];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-		inode->i_ino, fileid, count, offset);
-
-	smb_setup_header(req, SMBwrite, 5, count + 3);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	WSET(req->rq_header, smb_vwv1, count);
-	DSET(req->rq_header, smb_vwv2, offset);
-	WSET(req->rq_header, smb_vwv4, 0);
-
-	buf[0] = 1;
-	WSET(buf, 1, count);	/* yes, again ... */
-	req->rq_iov[1].iov_base = buf;
-	req->rq_iov[1].iov_len = 3;
-	req->rq_iov[2].iov_base = (char *) data;
-	req->rq_iov[2].iov_len = count;
-	req->rq_iovlen = 3;
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBwrite, 1, 0);
-	if (result >= 0)
-		result = WVAL(req->rq_header, smb_vwv0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * In smb_proc_readX and smb_proc_writeX we do not retry, because the
- * file-id would not be valid after a reconnection.
- */
-
-#define SMB_READX_MAX_PAD      64
-static void
-smb_proc_readX_data(struct smb_request *req)
-{
-	/* header length, excluding the netbios length (-4) */
-	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-	int data_off = WVAL(req->rq_header, smb_vwv6);
-
-	/*
-	 * Some genius made the padding to the data bytes arbitrary.
-	 * So we must first calculate the amount of padding used by the server.
-	 */
-	data_off -= hdrlen;
-	if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
-		PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
-		PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
-		req->rq_rlen = req->rq_bufsize + 1;
-		return;
-	}
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = data_off;
-
-	req->rq_iov[1].iov_base = req->rq_page;
-	req->rq_iov[1].iov_len  = req->rq_rsize;
-	req->rq_iovlen = 2;
-
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-
-static int
-smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	unsigned char *buf;
-	int result;
-	struct smb_request *req;
-	static char pad[SMB_READX_MAX_PAD];
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBreadX, 12, 0);
-	buf = req->rq_header;
-	WSET(buf, smb_vwv0, 0x00ff);
-	WSET(buf, smb_vwv1, 0);
-	WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
-	DSET(buf, smb_vwv3, (u32)offset);               /* low 32 bits */
-	WSET(buf, smb_vwv5, count);
-	WSET(buf, smb_vwv6, 0);
-	DSET(buf, smb_vwv7, 0);
-	WSET(buf, smb_vwv9, 0);
-	DSET(buf, smb_vwv10, (u32)(offset >> 32));      /* high 32 bits */
-	WSET(buf, smb_vwv11, 0);
-
-	req->rq_page = data;
-	req->rq_rsize = count;
-	req->rq_callback = smb_proc_readX_data;
-	req->rq_buffer = pad;
-	req->rq_bufsize = SMB_READX_MAX_PAD;
-	req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBreadX, 12, -1);
-	if (result < 0)
-		goto out_free;
-	result = WVAL(req->rq_header, smb_vwv5);
-
-out_free:
-	smb_rput(req);
-out:
-	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, result);
-	return result;
-}
-
-static int
-smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	u8 *p;
-	static u8 pad[4];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, offset);
-
-	p = smb_setup_header(req, SMBwriteX, 14, count + 1);
-	WSET(req->rq_header, smb_vwv0, 0x00ff);
-	WSET(req->rq_header, smb_vwv1, 0);
-	WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
-	DSET(req->rq_header, smb_vwv3, (u32)offset);	/* low 32 bits */
-	DSET(req->rq_header, smb_vwv5, 0);
-	WSET(req->rq_header, smb_vwv7, 0);		/* write mode */
-	WSET(req->rq_header, smb_vwv8, 0);
-	WSET(req->rq_header, smb_vwv9, 0);
-	WSET(req->rq_header, smb_vwv10, count);		/* data length */
-	WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
-	DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
-
-	req->rq_iov[1].iov_base = pad;
-	req->rq_iov[1].iov_len = 1;
-	req->rq_iov[2].iov_base = (char *) data;
-	req->rq_iov[2].iov_len = count;
-	req->rq_iovlen = 3;
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBwriteX, 6, 0);
- 	if (result >= 0)
-		result = WVAL(req->rq_header, smb_vwv2);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBcreate, 3, 0);
-	WSET(req->rq_header, smb_vwv0, attr);
-	DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, SMBcreate, 1, 0);
-	if (result < 0)
-		goto out_free;
-
-	*fileid = WVAL(req->rq_header, smb_vwv0);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(old_dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBmv, 1, 0);
-	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
-	result = smb_simple_encode_path(req, &p, old_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	result = smb_simple_encode_path(req, &p, new_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Code common to mkdir and rmdir.
- */
-static int
-smb_proc_generic_command(struct dentry *dentry, __u8 command)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, command, 0, 0);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, command, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_mkdir(struct dentry *dentry)
-{
-	return smb_proc_generic_command(dentry, SMBmkdir);
-}
-
-int
-smb_proc_rmdir(struct dentry *dentry)
-{
-	return smb_proc_generic_command(dentry, SMBrmdir);
-}
-
-#if SMBFS_POSIX_UNLINK
-/*
- * Removes readonly attribute from a file. Used by unlink to give posix
- * semantics.
- */
-static int
-smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
-{
-	int result;
-	struct smb_fattr fattr;
-
-	/* FIXME: cifsUE should allow removing a readonly file. */
-
-	/* first get current attribute */
-	smb_init_dirent(server, &fattr);
-	result = server->ops->getattr(server, dentry, &fattr);
-	smb_finish_dirent(server, &fattr);
-	if (result < 0)
-		return result;
-
-	/* if RONLY attribute is set, remove it */
-	if (fattr.attr & aRONLY) {  /* read only attribute is set */
-		fattr.attr &= ~aRONLY;
-		result = smb_proc_setattr_core(server, dentry, fattr.attr);
-	}
-	return result;
-}
-#endif
-
-int
-smb_proc_unlink(struct dentry *dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int flag = 0;
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-      retry:
-	p = smb_setup_header(req, SMBunlink, 1, 0);
-	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
-#if SMBFS_POSIX_UNLINK
-		if (result == -EACCES && !flag) {
-			/* Posix semantics is for the read-only state
-			   of a file to be ignored in unlink(). In the
-			   SMB world a unlink() is refused on a
-			   read-only file. To make things easier for
-			   unix users we try to override the files
-			   permission if the unlink fails with the
-			   right error.
-			   This introduces a race condition that could
-			   lead to a file being written by someone who
-			   shouldn't have access, but as far as I can
-			   tell that is unavoidable */
-
-			/* remove RONLY attribute and try again */
-			result = smb_set_rw(dentry,server);
-			if (result == 0) {
-				flag = 1;
-				req->rq_flags = 0;
-				goto retry;
-			}
-		}
-#endif
-		goto out_free;
-	}
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
-{
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBflush, 1, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBflush, 0, 0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_trunc32(struct inode *inode, loff_t length)
-{
-	/*
-	 * Writing 0bytes is old-SMB magic for truncating files.
-	 * MAX_NON_LFS should prevent this from being called with a too
-	 * large offset.
-	 */
-	return smb_proc_write(inode, length, 0, NULL);
-}
-
-static int
-smb_proc_trunc64(struct inode *inode, loff_t length)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	char *param;
-	char *data;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 14)))
-		goto out;
-
-	param = req->rq_buffer;
-	data = req->rq_buffer + 6;
-
-	/* FIXME: must we also set allocation size? winNT seems to do that */
-	WSET(param, 0, SMB_I(inode)->fileid);
-	WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
-	WSET(param, 4, 0);
-	LSET(data, 0, length);
-
-	req->rq_trans2_command = TRANSACT2_SETFILEINFO;
-	req->rq_ldata = 8;
-	req->rq_data  = data;
-	req->rq_lparm = 6;
-	req->rq_parm  = param;
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	result = 0;
-	if (req->rq_rcls != 0)
-		result = smb_errno(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_trunc95(struct inode *inode, loff_t length)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result = smb_proc_trunc32(inode, length);
- 
-	/*
-	 * win9x doesn't appear to update the size immediately.
-	 * It will return the old file size after the truncate,
-	 * confusing smbfs. So we force an update.
-	 *
-	 * FIXME: is this still necessary?
-	 */
-	smb_proc_flush(server, SMB_I(inode)->fileid);
-	return result;
-}
-
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-	memset(fattr, 0, sizeof(*fattr));
-
-	fattr->f_nlink = 1;
-	fattr->f_uid = server->mnt->uid;
-	fattr->f_gid = server->mnt->gid;
-	fattr->f_unix = 0;
-}
-
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-	if (fattr->f_unix)
-		return;
-
-	fattr->f_mode = server->mnt->file_mode;
-	if (fattr->attr & aDIR) {
-		fattr->f_mode = server->mnt->dir_mode;
-		fattr->f_size = SMB_ST_BLKSIZE;
-	}
-	/* Check the read-only flag */
-	if (fattr->attr & aRONLY)
-		fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
-
-	/* How many 512 byte blocks do we need for this file? */
-	fattr->f_blocks = 0;
-	if (fattr->f_size != 0)
-		fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
-	return;
-}
-
-void
-smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-		     struct super_block *sb)
-{
-	smb_init_dirent(server, fattr);
-	fattr->attr = aDIR;
-	fattr->f_ino = 2; /* traditional root inode number */
-	fattr->f_mtime = current_fs_time(sb);
-	smb_finish_dirent(server, fattr);
-}
-
-/*
- * Decode a dirent for old protocols
- *
- * qname is filled with the decoded, and possibly translated, name.
- * fattr receives decoded attributes
- *
- * Bugs Noted:
- * (1) Pathworks servers may pad the name with extra spaces.
- */
-static char *
-smb_decode_short_dirent(struct smb_sb_info *server, char *p,
-			struct qstr *qname, struct smb_fattr *fattr,
-			unsigned char *name_buf)
-{
-	int len;
-
-	/*
-	 * SMB doesn't have a concept of inode numbers ...
-	 */
-	smb_init_dirent(server, fattr);
-	fattr->f_ino = 0;	/* FIXME: do we need this? */
-
-	p += SMB_STATUS_SIZE;	/* reserved (search_status) */
-	fattr->attr = *p;
-	fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
-	fattr->f_mtime.tv_nsec = 0;
-	fattr->f_size = DVAL(p, 5);
-	fattr->f_ctime = fattr->f_mtime;
-	fattr->f_atime = fattr->f_mtime;
-	qname->name = p + 9;
-	len = strnlen(qname->name, 12);
-
-	/*
-	 * Trim trailing blanks for Pathworks servers
-	 */
-	while (len > 2 && qname->name[len-1] == ' ')
-		len--;
-
-	smb_finish_dirent(server, fattr);
-
-#if 0
-	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
-	   allow the flag to be set anyway. It kills const. Remove? */
-	switch (server->opt.case_handling) {
-	case SMB_CASE_UPPER:
-		str_upper(entry->name, len);
-		break;
-	case SMB_CASE_LOWER:
-		str_lower(entry->name, len);
-		break;
-	default:
-		break;
-	}
-#endif
-
-	qname->len = 0;
-	len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-				   qname->name, len,
-				   server->remote_nls, server->local_nls);
-	if (len > 0) {
-		qname->len = len;
-		qname->name = name_buf;
-		DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
-	}
-
-	return p + 22;
-}
-
-/*
- * This routine is used to read in directory entries from the network.
- * Note that it is for short directory name seeks, i.e.: protocol <
- * SMB_PROTOCOL_LANMAN2
- */
-static int
-smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
-		       struct smb_cache_control *ctl)
-{
-	struct dentry *dir = filp->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dir);
-	struct qstr qname;
-	struct smb_fattr fattr;
-	char *p;
-	int result;
-	int i, first, entries_seen, entries;
-	int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
-	__u16 bcc;
-	__u16 count;
-	char status[SMB_STATUS_SIZE];
-	static struct qstr mask = {
-		.name	= "*.*",
-		.len	= 3,
-	};
-	unsigned char *last_status;
-	struct smb_request *req;
-	unsigned char *name_buf;
-
-	VERBOSE("%s/%s\n", DENTRY_PATH(dir));
-
-	lock_kernel();
-
-	result = -ENOMEM;
-	if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
-		goto out;
-
-	first = 1;
-	entries = 0;
-	entries_seen = 2; /* implicit . and .. */
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-		goto out_name;
-
-	while (1) {
-		p = smb_setup_header(req, SMBsearch, 2, 0);
-		WSET(req->rq_header, smb_vwv0, entries_asked);
-		WSET(req->rq_header, smb_vwv1, aDIR);
-		if (first == 1) {
-			result = smb_simple_encode_path(req, &p, dir, &mask);
-			if (result < 0)
-				goto out_free;
-			if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
-				result = -ENAMETOOLONG;
-				goto out_free;
-			}
-			*p++ = 5;
-			WSET(p, 0, 0);
-			p += 2;
-			first = 0;
-		} else {
-			if (p + 5 + SMB_STATUS_SIZE >
-			    (char *)req->rq_buffer + req->rq_bufsize) {
-				result = -ENAMETOOLONG;
-				goto out_free;
-			}
-				
-			*p++ = 4;
-			*p++ = 0;
-			*p++ = 5;
-			WSET(p, 0, SMB_STATUS_SIZE);
-			p += 2;
-			memcpy(p, status, SMB_STATUS_SIZE);
-			p += SMB_STATUS_SIZE;
-		}
-
-		smb_setup_bcc(req, p);
-
-		result = smb_request_ok(req, SMBsearch, 1, -1);
-		if (result < 0) {
-			if ((req->rq_rcls == ERRDOS) && 
-			    (req->rq_err  == ERRnofiles))
-				break;
-			goto out_free;
-		}
-		count = WVAL(req->rq_header, smb_vwv0);
-		if (count <= 0)
-			break;
-
-		result = -EIO;
-		bcc = smb_bcc(req->rq_header);
-		if (bcc != count * SMB_DIRINFO_SIZE + 3)
-			goto out_free;
-		p = req->rq_buffer + 3;
-
-
-		/* Make sure the response fits in the buffer. Fixed sized 
-		   entries means we don't have to check in the decode loop. */
-
-		last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
-
-		if (last_status + SMB_DIRINFO_SIZE >=
-		    req->rq_buffer + req->rq_bufsize) {
-			printk(KERN_ERR "smb_proc_readdir_short: "
-			       "last dir entry outside buffer! "
-			       "%d@%p  %d@%p\n", SMB_DIRINFO_SIZE, last_status,
-			       req->rq_bufsize, req->rq_buffer);
-			goto out_free;
-		}
-
-		/* Read the last entry into the status field. */
-		memcpy(status, last_status, SMB_STATUS_SIZE);
-
-
-		/* Now we are ready to parse smb directory entries. */
-
-		for (i = 0; i < count; i++) {
-			p = smb_decode_short_dirent(server, p, 
-						    &qname, &fattr, name_buf);
-			if (qname.len == 0)
-				continue;
-
-			if (entries_seen == 2 && qname.name[0] == '.') {
-				if (qname.len == 1)
-					continue;
-				if (qname.name[1] == '.' && qname.len == 2)
-					continue;
-			}
-			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-					    &qname, &fattr))
-				;	/* stop reading? */
-			entries_seen++;
-		}
-	}
-	result = entries;
-
-out_free:
-	smb_rput(req);
-out_name:
-	kfree(name_buf);
-out:
-	unlock_kernel();
-	return result;
-}
-
-static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
-{
-	u64 size, disk_bytes;
-
-	/* FIXME: verify nls support. all is sent as utf8? */
-
-	fattr->f_unix = 1;
-	fattr->f_mode = 0;
-
-	/* FIXME: use the uniqueID from the remote instead? */
-	/* 0 L file size in bytes */
-	/* 8 L file size on disk in bytes (block count) */
-	/* 40 L uid */
-	/* 48 L gid */
-	/* 56 W file type */
-	/* 60 L devmajor */
-	/* 68 L devminor */
-	/* 76 L unique ID (inode) */
-	/* 84 L permissions */
-	/* 92 L link count */
-
-	size = LVAL(p, 0);
-	disk_bytes = LVAL(p, 8);
-
-	/*
-	 * Some samba versions round up on-disk byte usage
-	 * to 1MB boundaries, making it useless. When seeing
-	 * that, use the size instead.
-	 */
-	if (!(disk_bytes & 0xfffff))
-		disk_bytes = size+511;
-
-	fattr->f_size = size;
-	fattr->f_blocks = disk_bytes >> 9;
-	fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
-	fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
-	fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
-
-	if (server->mnt->flags & SMB_MOUNT_UID)
-		fattr->f_uid = server->mnt->uid;
-	else
-		fattr->f_uid = LVAL(p, 40);
-
-	if (server->mnt->flags & SMB_MOUNT_GID)
-		fattr->f_gid = server->mnt->gid;
-	else
-		fattr->f_gid = LVAL(p, 48);
-
-	fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
-
-	if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
-		__u64 major = LVAL(p, 60);
-		__u64 minor = LVAL(p, 68);
-
-		fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
-		if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
-	    	MINOR(fattr->f_rdev) != (minor & 0xffffffff))
-			fattr->f_rdev = 0;
-	}
-
-	fattr->f_mode |= LVAL(p, 84);
-
-	if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
-	     (S_ISDIR(fattr->f_mode)) )
-		fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-	else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
-	          !(S_ISDIR(fattr->f_mode)) )
-		fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
-				(fattr->f_mode & S_IFMT);
-
-}
-
-/*
- * Interpret a long filename structure using the specified info level:
- *   level 1 for anything below NT1 protocol
- *   level 260 for NT1 protocol
- *
- * qname is filled with the decoded, and possibly translated, name
- * fattr receives decoded attributes.
- *
- * Bugs Noted:
- * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
- */
-static char *
-smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
-		       struct qstr *qname, struct smb_fattr *fattr,
-		       unsigned char *name_buf)
-{
-	char *result;
-	unsigned int len = 0;
-	int n;
-	__u16 date, time;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-
-	/*
-	 * SMB doesn't have a concept of inode numbers ...
-	 */
-	smb_init_dirent(server, fattr);
-	fattr->f_ino = 0;	/* FIXME: do we need this? */
-
-	switch (level) {
-	case 1:
-		len = *((unsigned char *) p + 22);
-		qname->name = p + 23;
-		result = p + 24 + len;
-
-		date = WVAL(p, 0);
-		time = WVAL(p, 2);
-		fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_ctime.tv_nsec = 0;
-
-		date = WVAL(p, 4);
-		time = WVAL(p, 6);
-		fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_atime.tv_nsec = 0;
-
-		date = WVAL(p, 8);
-		time = WVAL(p, 10);
-		fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_mtime.tv_nsec = 0;
-		fattr->f_size = DVAL(p, 12);
-		/* ULONG allocation size */
-		fattr->attr = WVAL(p, 20);
-
-		VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	case 260:
-		result = p + WVAL(p, 0);
-		len = DVAL(p, 60);
-		if (len > 255) len = 255;
-		/* NT4 null terminates, unless we are using unicode ... */
-		qname->name = p + 94;
-		if (!unicode && len && qname->name[len-1] == '\0')
-			len--;
-
-		fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
-		fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
-		fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
-		/* change time (32) */
-		fattr->f_size = LVAL(p, 40);
-		/* alloc size (48) */
-		fattr->attr = DVAL(p, 56);
-
-		VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	case SMB_FIND_FILE_UNIX:
-		result = p + WVAL(p, 0);
-		qname->name = p + 108;
-
-		len = strlen(qname->name);
-		/* FIXME: should we check the length?? */
-
-		p += 8;
-		smb_decode_unix_basic(fattr, server, p);
-		VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	default:
-		PARANOIA("Unknown info level %d\n", level);
-		result = p + WVAL(p, 0);
-		goto out;
-	}
-
-	smb_finish_dirent(server, fattr);
-
-#if 0
-	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
-	   allow the flag to be set anyway. Remove? */
-	switch (server->opt.case_handling) {
-	case SMB_CASE_UPPER:
-		str_upper(qname->name, len);
-		break;
-	case SMB_CASE_LOWER:
-		str_lower(qname->name, len);
-		break;
-	default:
-		break;
-	}
-#endif
-
-	qname->len = 0;
-	n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-				 qname->name, len,
-				 server->remote_nls, server->local_nls);
-	if (n > 0) {
-		qname->len = n;
-		qname->name = name_buf;
-	}
-
-out:
-	return result;
-}
-
-/* findfirst/findnext flags */
-#define SMB_CLOSE_AFTER_FIRST (1<<0)
-#define SMB_CLOSE_IF_END (1<<1)
-#define SMB_REQUIRE_RESUME_KEY (1<<2)
-#define SMB_CONTINUE_BIT (1<<3)
-
-/*
- * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
- * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
- * go there for advise.
- *
- * Bugs Noted:
- * (1) When using Info Level 1 Win NT 4.0 truncates directory listings 
- * for certain patterns of names and/or lengths. The breakage pattern
- * is completely reproducible and can be toggled by the creation of a
- * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
- */
-static int
-smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
-		      struct smb_cache_control *ctl)
-{
-	struct dentry *dir = filp->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dir);
-	struct qstr qname;
-	struct smb_fattr fattr;
-
-	unsigned char *p, *lastname;
-	char *mask, *param;
-	__u16 command;
-	int first, entries_seen;
-
-	/* Both NT and OS/2 accept info level 1 (but see note below). */
-	int info_level = 260;
-	const int max_matches = 512;
-
-	unsigned int ff_searchcount = 0;
-	unsigned int ff_eos = 0;
-	unsigned int ff_lastname = 0;
-	unsigned int ff_dir_handle = 0;
-	unsigned int loop_count = 0;
-	unsigned int mask_len, i;
-	int result;
-	struct smb_request *req;
-	unsigned char *name_buf;
-	static struct qstr star = {
-		.name	= "*",
-		.len	= 1,
-	};
-
-	lock_kernel();
-
-	/*
-	 * We always prefer unix style. Use info level 1 for older
-	 * servers that don't do 260.
-	 */
-	if (server->opt.capabilities & SMB_CAP_UNIX)
-		info_level = SMB_FIND_FILE_UNIX;
-	else if (server->opt.protocol < SMB_PROTOCOL_NT1)
-		info_level = 1;
-
-	result = -ENOMEM;
-	if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
-		goto out;
-	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-		goto out_name;
-	param = req->rq_buffer;
-
-	/*
-	 * Encode the initial path
-	 */
-	mask = param + 12;
-
-	result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
-	if (result <= 0)
-		goto out_free;
-	mask_len = result - 1;	/* mask_len is strlen, not #bytes */
-	result = 0;
-	first = 1;
-	VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
-
-	entries_seen = 2;
-	ff_eos = 0;
-
-	while (ff_eos == 0) {
-		loop_count += 1;
-		if (loop_count > 10) {
-			printk(KERN_WARNING "smb_proc_readdir_long: "
-			       "Looping in FIND_NEXT??\n");
-			result = -EIO;
-			break;
-		}
-
-		if (first != 0) {
-			command = TRANSACT2_FINDFIRST;
-			WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-			WSET(param, 2, max_matches);	/* max count */
-			WSET(param, 4, SMB_CLOSE_IF_END);
-			WSET(param, 6, info_level);
-			DSET(param, 8, 0);
-		} else {
-			command = TRANSACT2_FINDNEXT;
-
-			VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
-				ff_dir_handle, ff_lastname, mask_len, mask);
-
-			WSET(param, 0, ff_dir_handle);	/* search handle */
-			WSET(param, 2, max_matches);	/* max count */
-			WSET(param, 4, info_level);
-			DSET(param, 6, 0);
-			WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
-		}
-
-		req->rq_trans2_command = command;
-		req->rq_ldata = 0;
-		req->rq_data  = NULL;
-		req->rq_lparm = 12 + mask_len + 1;
-		req->rq_parm  = param;
-		req->rq_flags = 0;
-		result = smb_add_request(req);
-		if (result < 0) {
-			PARANOIA("error=%d, breaking\n", result);
-			break;
-		}
-
-		if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
-			/* a damn Win95 bug - sometimes it clags if you 
-			   ask it too fast */
-			schedule_timeout_interruptible(msecs_to_jiffies(200));
-			continue;
-                }
-
-		if (req->rq_rcls != 0) {
-			result = smb_errno(req);
-			PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
-				 mask, result, req->rq_rcls, req->rq_err);
-			break;
-		}
-
-		/* parse out some important return info */
-		if (first != 0) {
-			ff_dir_handle = WVAL(req->rq_parm, 0);
-			ff_searchcount = WVAL(req->rq_parm, 2);
-			ff_eos = WVAL(req->rq_parm, 4);
-			ff_lastname = WVAL(req->rq_parm, 8);
-		} else {
-			ff_searchcount = WVAL(req->rq_parm, 0);
-			ff_eos = WVAL(req->rq_parm, 2);
-			ff_lastname = WVAL(req->rq_parm, 6);
-		}
-
-		if (ff_searchcount == 0)
-			break;
-
-		/* Now we are ready to parse smb directory entries. */
-
-		/* point to the data bytes */
-		p = req->rq_data;
-		for (i = 0; i < ff_searchcount; i++) {
-			/* make sure we stay within the buffer */
-			if (p >= req->rq_data + req->rq_ldata) {
-				printk(KERN_ERR "smb_proc_readdir_long: "
-				       "dirent pointer outside buffer! "
-				       "%p  %d@%p\n",
-				       p, req->rq_ldata, req->rq_data);
-				result = -EIO; /* always a comm. error? */
-				goto out_free;
-			}
-
-			p = smb_decode_long_dirent(server, p, info_level,
-						   &qname, &fattr, name_buf);
-
-			/* ignore . and .. from the server */
-			if (entries_seen == 2 && qname.name[0] == '.') {
-				if (qname.len == 1)
-					continue;
-				if (qname.name[1] == '.' && qname.len == 2)
-					continue;
-			}
-
-			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-					    &qname, &fattr))
-				;	/* stop reading? */
-			entries_seen++;
-		}
-
-		VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
-
-		/*
-		 * We might need the lastname for continuations.
-		 *
-		 * Note that some servers (win95?) point to the filename and
-		 * others (NT4, Samba using NT1) to the dir entry. We assume
-		 * here that those who do not point to a filename do not need
-		 * this info to continue the listing.
-		 *
-		 * OS/2 needs this and talks infolevel 1.
-		 * NetApps want lastname with infolevel 260.
-		 * win2k want lastname with infolevel 260, and points to
-		 *       the record not to the name.
-		 * Samba+CifsUnixExt doesn't need lastname.
-		 *
-		 * Both are happy if we return the data they point to. So we do.
-		 * (FIXME: above is not true with win2k)
-		 */
-		mask_len = 0;
-		if (info_level != SMB_FIND_FILE_UNIX &&
-		    ff_lastname > 0 && ff_lastname < req->rq_ldata) {
-			lastname = req->rq_data + ff_lastname;
-
-			switch (info_level) {
-			case 260:
-				mask_len = req->rq_ldata - ff_lastname;
-				break;
-			case 1:
-				/* lastname points to a length byte */
-				mask_len = *lastname++;
-				if (ff_lastname + 1 + mask_len > req->rq_ldata)
-					mask_len = req->rq_ldata - ff_lastname - 1;
-				break;
-			}
-
-			/*
-			 * Update the mask string for the next message.
-			 */
-			if (mask_len > 255)
-				mask_len = 255;
-			if (mask_len)
-				strncpy(mask, lastname, mask_len);
-		}
-		mask_len = strnlen(mask, mask_len);
-		VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
-			mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
-
-		first = 0;
-		loop_count = 0;
-	}
-
-out_free:
-	smb_rput(req);
-out_name:
-	kfree(name_buf);
-out:
-	unlock_kernel();
-	return result;
-}
-
-/*
- * This version uses the trans2 TRANSACT2_FINDFIRST message 
- * to get the attribute data.
- *
- * Bugs Noted:
- */
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-			struct smb_fattr *fattr)
-{
-	char *param, *mask;
-	__u16 date, time;
-	int mask_len, result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-	mask = param + 12;
-
-	mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
-	if (mask_len < 0) {
-		result = mask_len;
-		goto out_free;
-	}
-	VERBOSE("name=%s, len=%d\n", mask, mask_len);
-	WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-	WSET(param, 2, 1);	/* max count */
-	WSET(param, 4, 1);	/* close after this call */
-	WSET(param, 6, 1);	/* info_level */
-	DSET(param, 8, 0);
-
-	req->rq_trans2_command = TRANSACT2_FINDFIRST;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = 12 + mask_len;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	if (req->rq_rcls != 0) {
-		result = smb_errno(req);
-#ifdef SMBFS_PARANOIA
-		if (result != -ENOENT)
-			PARANOIA("error for %s, rcls=%d, err=%d\n",
-				 mask, req->rq_rcls, req->rq_err);
-#endif
-		goto out_free;
-	}
-	/* Make sure we got enough data ... */
-	result = -EINVAL;
-	if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
-		PARANOIA("bad result for %s, len=%d, count=%d\n",
-			 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
-		goto out_free;
-	}
-
-	/*
-	 * Decode the response into the fattr ...
-	 */
-	date = WVAL(req->rq_data, 0);
-	time = WVAL(req->rq_data, 2);
-	fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_ctime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 4);
-	time = WVAL(req->rq_data, 6);
-	fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_atime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 8);
-	time = WVAL(req->rq_data, 10);
-	fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_mtime.tv_nsec = 0;
-	VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-		mask, date, time, fattr->f_mtime.tv_sec);
-	fattr->f_size = DVAL(req->rq_data, 12);
-	/* ULONG allocation size */
-	fattr->attr = WVAL(req->rq_data, 20);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *fattr)
-{
-	int result;
-	char *p;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBgetatr, 0, 0);
-	result = smb_simple_encode_path(req, &p, dir, NULL);
-	if (result < 0)
- 		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
-		goto out_free;
-	fattr->attr    = WVAL(req->rq_header, smb_vwv0);
-	fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
-	fattr->f_mtime.tv_nsec = 0;
-	fattr->f_size  = DVAL(req->rq_header, smb_vwv3);
-	fattr->f_ctime = fattr->f_mtime; 
-	fattr->f_atime = fattr->f_mtime; 
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk("getattr_core: %s/%s, mtime=%ld\n",
-	       DENTRY_PATH(dir), fattr->f_mtime);
-#endif
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Bugs Noted:
- * (1) Win 95 swaps the date and time fields in the standard info level.
- */
-static int
-smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
-			struct smb_request *req, int infolevel)
-{
-	char *p, *param;
-	int result;
-
-	param = req->rq_buffer;
-	WSET(param, 0, infolevel);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-	if (result < 0)
-		goto out;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_QPATHINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out;
-	if (req->rq_rcls != 0) {
-		VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
-			&param[6], result, req->rq_rcls, req->rq_err);
-		result = smb_errno(req);
-		goto out;
-	}
-	result = -ENOENT;
-	if (req->rq_ldata < 22) {
-		PARANOIA("not enough data for %s, len=%d\n",
-			 &param[6], req->rq_ldata);
-		goto out;
-	}
-
-	result = 0;
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
-			    struct smb_fattr *attr)
-{
-	u16 date, time;
-	int off_date = 0, off_time = 2;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
-	if (result < 0)
-		goto out_free;
-
-	/*
-	 * Kludge alert: Win 95 swaps the date and time field,
-	 * contrary to the CIFS docs and Win NT practice.
-	 */
-	if (server->mnt->flags & SMB_MOUNT_WIN95) {
-		off_date = 2;
-		off_time = 0;
-	}
-	date = WVAL(req->rq_data, off_date);
-	time = WVAL(req->rq_data, off_time);
-	attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_ctime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 4 + off_date);
-	time = WVAL(req->rq_data, 4 + off_time);
-	attr->f_atime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_atime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 8 + off_date);
-	time = WVAL(req->rq_data, 8 + off_time);
-	attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_mtime.tv_nsec = 0;
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
-	       DENTRY_PATH(dir), date, time, attr->f_mtime);
-#endif
-	attr->f_size = DVAL(req->rq_data, 12);
-	attr->attr = WVAL(req->rq_data, 20);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
-			    struct smb_fattr *attr)
-{
-	struct smb_request *req;
-	int result;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req,
-					 SMB_QUERY_FILE_ALL_INFO);
-	if (result < 0)
-		goto out_free;
-
-	attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
-	attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
-	attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
-	/* change (24) */
-	attr->attr = WVAL(req->rq_data, 32);
-	/* pad? (34) */
-	/* allocated size (40) */
-	attr->f_size = LVAL(req->rq_data, 48);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *attr)
-{
-	struct smb_request *req;
-	int result;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req,
-					 SMB_QUERY_FILE_UNIX_BASIC);
-	if (result < 0)
-		goto out_free;
-
-	smb_decode_unix_basic(attr, server, req->rq_data);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
-		    struct smb_fattr *attr)
-{
-	struct inode *inode = dir->d_inode;
-	int result;
-
-	/* FIXME: why not use the "all" version? */
-	result = smb_proc_getattr_trans2_std(server, dir, attr);
-	if (result < 0)
-		goto out;
-
-	/*
-	 * None of the getattr versions here can make win9x return the right
-	 * filesize if there are changes made to an open file.
-	 * A seek-to-end does return the right size, but we only need to do
-	 * that on files we have written.
-	 */
-	if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
-	    smb_is_open(inode))
-	{
-		__u16 fileid = SMB_I(inode)->fileid;
-		attr->f_size = smb_proc_seek(server, fileid, 2, 0);
-	}
-
-out:
-	return result;
-}
-
-static int
-smb_proc_ops_wait(struct smb_sb_info *server)
-{
-	int result;
-
-	result = wait_event_interruptible_timeout(server->conn_wq,
-				server->conn_complete, 30*HZ);
-
-	if (!result || signal_pending(current))
-		return -EIO;
-
-	return 0;
-}
-
-static int
-smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
-			  struct smb_fattr *fattr)
-{
-	int result;
-
-	if (smb_proc_ops_wait(server) < 0)
-		return -EIO;
-
-	smb_init_dirent(server, fattr);
-	result = server->ops->getattr(server, dir, fattr);
-	smb_finish_dirent(server, fattr);
-
-	return result;
-}
-
-static int
-smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
-		      struct smb_cache_control *ctl)
-{
-	struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
-
-	if (smb_proc_ops_wait(server) < 0)
-		return -EIO;
-
-	return server->ops->readdir(filp, dirent, filldir, ctl);
-}
-
-int
-smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dir);
-	int result;
-
-	smb_init_dirent(server, fattr);
-	result = server->ops->getattr(server, dir, fattr);
-	smb_finish_dirent(server, fattr);
-
-	return result;
-}
-
-
-/*
- * Because of bugs in the core protocol, we use this only to set
- * attributes. See smb_proc_settime() below for timestamp handling.
- *
- * Bugs Noted:
- * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
- * with an undocumented error (ERRDOS code 50). Setting
- * mtime to 0 allows the attributes to be set.
- * (2) The extra parameters following the name string aren't
- * in the CIFS docs, but seem to be necessary for operation.
- */
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-		      __u16 attr)
-{
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBsetatr, 8, 0);
-	WSET(req->rq_header, smb_vwv0, attr);
-	DSET(req->rq_header, smb_vwv1, 0); /* mtime */
-	WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
-	WSET(req->rq_header, smb_vwv4, 0);
-	WSET(req->rq_header, smb_vwv5, 0);
-	WSET(req->rq_header, smb_vwv6, 0);
-	WSET(req->rq_header, smb_vwv7, 0);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
-		result = -ENAMETOOLONG;
-		goto out_free;
-	}
-	*p++ = 4;
-	*p++ = 0;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, SMBsetatr, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Because of bugs in the trans2 setattr messages, we must set
- * attributes and timestamps separately. The core SMBsetatr
- * message seems to be the only reliable way to set attributes.
- */
-int
-smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dir);
-	int result;
-
-	VERBOSE("setting %s/%s, open=%d\n", 
-		DENTRY_PATH(dir), smb_is_open(dir->d_inode));
-	result = smb_proc_setattr_core(server, dir, fattr->attr);
-	return result;
-}
-
-/*
- * Sets the timestamps for an file open with write permissions.
- */
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-		      struct inode *inode, struct smb_fattr *fattr)
-{
-	__u16 date, time;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBsetattrE, 7, 0);
-	WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
-	/* We don't change the creation time */
-	WSET(req->rq_header, smb_vwv1, 0);
-	WSET(req->rq_header, smb_vwv2, 0);
-	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-	WSET(req->rq_header, smb_vwv3, date);
-	WSET(req->rq_header, smb_vwv4, time);
-	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-	WSET(req->rq_header, smb_vwv5, date);
-	WSET(req->rq_header, smb_vwv6, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
-	       date, time, fattr->f_mtime);
-#endif
-
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBsetattrE, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Bugs Noted:
- * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
- * set the file's attribute flags.
- */
-static int
-smb_proc_setattr_trans2(struct smb_sb_info *server,
-			struct dentry *dir, struct smb_fattr *fattr)
-{
-	__u16 date, time;
-	char *p, *param;
-	int result;
-	char data[26];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, 1);	/* Info level SMB_INFO_STANDARD */
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	WSET(data, 0, 0); /* creation time */
-	WSET(data, 2, 0);
-	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-	WSET(data, 4, date);
-	WSET(data, 6, time);
-	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-	WSET(data, 8, date);
-	WSET(data, 10, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", 
-	       DENTRY_PATH(dir), date, time, fattr->f_mtime);
-#endif
-	DSET(data, 12, 0); /* size */
-	DSET(data, 16, 0); /* blksize */
-	WSET(data, 20, 0); /* attr */
-	DSET(data, 22, 0); /* ULONG EA size */
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = 26;
-	req->rq_data  = data;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-	if (req->rq_rcls != 0)
-		result = smb_errno(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * ATTR_MODE      0x001
- * ATTR_UID       0x002
- * ATTR_GID       0x004
- * ATTR_SIZE      0x008
- * ATTR_ATIME     0x010
- * ATTR_MTIME     0x020
- * ATTR_CTIME     0x040
- * ATTR_ATIME_SET 0x080
- * ATTR_MTIME_SET 0x100
- * ATTR_FORCE     0x200	
- * ATTR_ATTR_FLAG 0x400
- *
- * major/minor should only be set by mknod.
- */
-int
-smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
-		      unsigned int major, unsigned int minor)
-{
-	struct smb_sb_info *server = server_from_dentry(d);
-	u64 nttime;
-	char *p, *param;
-	int result;
-	char data[100];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	/* 0 L file size in bytes */
-	/* 8 L file size on disk in bytes (block count) */
-	/* 40 L uid */
-	/* 48 L gid */
-	/* 56 W file type enum */
-	/* 60 L devmajor */
-	/* 68 L devminor */
-	/* 76 L unique ID (inode) */
-	/* 84 L permissions */
-	/* 92 L link count */
-	LSET(data, 0, SMB_SIZE_NO_CHANGE);
-	LSET(data, 8, SMB_SIZE_NO_CHANGE);
-	LSET(data, 16, SMB_TIME_NO_CHANGE);
-	LSET(data, 24, SMB_TIME_NO_CHANGE);
-	LSET(data, 32, SMB_TIME_NO_CHANGE);
-	LSET(data, 40, SMB_UID_NO_CHANGE);
-	LSET(data, 48, SMB_GID_NO_CHANGE);
-	DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
-	LSET(data, 60, major);
-	LSET(data, 68, minor);
-	LSET(data, 76, 0);
-	LSET(data, 84, SMB_MODE_NO_CHANGE);
-	LSET(data, 92, 0);
-
-	if (attr->ia_valid & ATTR_SIZE) {
-		LSET(data, 0, attr->ia_size);
-		LSET(data, 8, 0); /* can't set anyway */
-	}
-
-	/*
-	 * FIXME: check the conversion function it the correct one
-	 *
-	 * we can't set ctime but we might as well pass this to the server
-	 * and let it ignore it.
-	 */
-	if (attr->ia_valid & ATTR_CTIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_ctime);
-		LSET(data, 16, nttime);
-	}
-	if (attr->ia_valid & ATTR_ATIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_atime);
-		LSET(data, 24, nttime);
-	}
-	if (attr->ia_valid & ATTR_MTIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_mtime);
-		LSET(data, 32, nttime);
-	}
-	
-	if (attr->ia_valid & ATTR_UID) {
-		LSET(data, 40, attr->ia_uid);
-	}
-	if (attr->ia_valid & ATTR_GID) {
-		LSET(data, 48, attr->ia_gid); 
-	}
-	
-	if (attr->ia_valid & ATTR_MODE) {
-		LSET(data, 84, attr->ia_mode);
-	}
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = 100;
-	req->rq_data  = data;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-/*
- * Set the modify and access timestamps for a file.
- *
- * Incredibly enough, in all of SMB there is no message to allow
- * setting both attributes and timestamps at once. 
- *
- * Bugs Noted:
- * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message 
- * with info level 1 (INFO_STANDARD).
- * (2) Win 95 seems not to support setting directory timestamps.
- * (3) Under the core protocol apparently the only way to set the
- * timestamp is to open and close the file.
- */
-int
-smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode *inode = dentry->d_inode;
-	int result;
-
-	VERBOSE("setting %s/%s, open=%d\n",
-		DENTRY_PATH(dentry), smb_is_open(inode));
-
-	/* setting the time on a Win95 server fails (tridge) */
-	if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && 
-	    !(server->mnt->flags & SMB_MOUNT_WIN95)) {
-		if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
-			result = smb_proc_setattr_ext(server, inode, fattr);
-		else
-			result = smb_proc_setattr_trans2(server, dentry, fattr);
-	} else {
-		/*
-		 * Fail silently on directories ... timestamp can't be set?
-		 */
-		result = 0;
-		if (S_ISREG(inode->i_mode)) {
-			/*
-			 * Set the mtime by opening and closing the file.
-			 * Note that the file is opened read-only, but this
-			 * still allows us to set the date (tridge)
-			 */
-			result = -EACCES;
-			if (!smb_is_open(inode))
-				smb_proc_open(server, dentry, SMB_O_RDONLY);
-			if (smb_is_open(inode)) {
-				inode->i_mtime = fattr->f_mtime;
-				result = smb_proc_close_inode(server, inode);
-			}
-		}
-	}
-
-	return result;
-}
-
-int
-smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
-{
-	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
-	int result;
-	char *p;
-	long unit;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBdskattr, 0, 0);
-	if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
-		goto out_free;
-	p = SMB_VWV(req->rq_header);
-	unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
-	attr->f_blocks = WVAL(p, 0) * unit;
-	attr->f_bsize  = SMB_ST_BLKSIZE;
-	attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
-		   char *buffer, int len)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_QPATHINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-		&param[6], result, req->rq_rcls, req->rq_err);
-
-	/* copy data up to the \0 or buffer length */
-	result = len;
-	if (req->rq_ldata < len)
-		result = req->rq_ldata;
-	strncpy(buffer, req->rq_data, result);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-/*
- * Create a symlink object called dentry which points to oldpath.
- * Samba does not permit dangling links but returns a suitable error message.
- */
-int
-smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
-		 const char *oldpath)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = strlen(oldpath) + 1;
-	req->rq_data  = (char *) oldpath;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-		&param[6], result, req->rq_rcls, req->rq_err);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Create a hard link object called new_dentry which points to dentry.
- */
-int
-smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
-	      struct dentry *new_dentry)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
-				 new_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	/* Grr, pointless separation of parameters and data ... */
-	req->rq_data = p;
-	req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
-					dentry, NULL);
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-	       &param[6], result, req->rq_rcls, req->rq_err);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server)
-{
-	int result;
-	int major, minor;
-	u64 caps;
-	char param[2];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 100)))
-		goto out;
-
-	WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
-
-	req->rq_trans2_command = TRANSACT2_QFSINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = 2;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	if (req->rq_ldata < 12) {
-		PARANOIA("Not enough data\n");
-		goto out_free;
-	}
-	major = WVAL(req->rq_data, 0);
-	minor = WVAL(req->rq_data, 2);
-
-	DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
-	       major, minor);
-	/* FIXME: verify that we are ok with this major/minor? */
-
-	caps = LVAL(req->rq_data, 4);
-	DEBUG1("Server capabilities 0x%016llx\n", caps);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src)
-{
-	memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
-}
-
-/* < LANMAN2 */
-static struct smb_ops smb_ops_core =
-{
-	.read		= smb_proc_read,
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_short,
-	.getattr	= smb_proc_getattr_core,
-	.truncate	= smb_proc_trunc32,
-};
-
-/* LANMAN2, OS/2, others? */
-static struct smb_ops smb_ops_os2 =
-{
-	.read		= smb_proc_read,
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_trans2_std,
-	.truncate	= smb_proc_trunc32,
-};
-
-/* Win95, and possibly some NetApp versions too */
-static struct smb_ops smb_ops_win95 =
-{
-	.read		= smb_proc_read,    /* does not support 12word readX */
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_95,
-	.truncate	= smb_proc_trunc95,
-};
-
-/* Samba, NT4 and NT5 */
-static struct smb_ops smb_ops_winNT =
-{
-	.read		= smb_proc_readX,
-	.write		= smb_proc_writeX,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_trans2_all,
-	.truncate	= smb_proc_trunc64,
-};
-
-/* Samba w/ unix extensions. Others? */
-static struct smb_ops smb_ops_unix =
-{
-	.read		= smb_proc_readX,
-	.write		= smb_proc_writeX,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_unix,
-	/* FIXME: core/ext/time setattr needs to be cleaned up! */
-	/* .setattr	= smb_proc_setattr_unix, */
-	.truncate	= smb_proc_trunc64,
-};
-
-/* Place holder until real ops are in place */
-static struct smb_ops smb_ops_null =
-{
-	.readdir	= smb_proc_readdir_null,
-	.getattr	= smb_proc_getattr_null,
-};
-
-void smb_install_null_ops(struct smb_ops *ops)
-{
-	install_ops(ops, &smb_ops_null);
-}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Autogenerated with cproto on:  Sat Sep 13 17:18:51 CEST 2003
- */
-
-struct smb_request;
-struct sock;
-struct statfs;
-
-/* proc.c */
-extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
-extern __u32 smb_len(__u8 *p);
-extern int smb_get_rsize(struct smb_sb_info *server);
-extern int smb_get_wsize(struct smb_sb_info *server);
-extern int smb_errno(struct smb_request *req);
-extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
-extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
-extern int smb_open(struct dentry *dentry, int wish);
-extern int smb_close(struct inode *ino);
-extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
-extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
-extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
-extern int smb_proc_mkdir(struct dentry *dentry);
-extern int smb_proc_rmdir(struct dentry *dentry);
-extern int smb_proc_unlink(struct dentry *dentry);
-extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
-extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-				 struct super_block *sb);
-extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
-extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
-extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
-extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
-extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
-extern void smb_install_null_ops(struct smb_ops *ops);
-/* dir.c */
-extern const struct file_operations smb_dir_operations;
-extern const struct inode_operations smb_dir_inode_operations;
-extern const struct inode_operations smb_dir_inode_operations_unix;
-extern void smb_new_dentry(struct dentry *dentry);
-extern void smb_renew_times(struct dentry *dentry);
-/* cache.c */
-extern void smb_invalid_dir_cache(struct inode *dir);
-extern void smb_invalidate_dircache_entries(struct dentry *parent);
-extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
-extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
-/* sock.c */
-extern void smb_data_ready(struct sock *sk, int len);
-extern int smb_valid_socket(struct inode *inode);
-extern void smb_close_socket(struct smb_sb_info *server);
-extern int smb_recv_available(struct smb_sb_info *server);
-extern int smb_receive_header(struct smb_sb_info *server);
-extern int smb_receive_drop(struct smb_sb_info *server);
-extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
-extern int smb_send_request(struct smb_request *req);
-/* inode.c */
-extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
-extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_invalidate_inodes(struct smb_sb_info *server);
-extern int smb_revalidate_inode(struct dentry *dentry);
-extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
-/* file.c */
-extern const struct address_space_operations smb_file_aops;
-extern const struct file_operations smb_file_operations;
-extern const struct inode_operations smb_file_inode_operations;
-/* ioctl.c */
-extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-/* smbiod.c */
-extern void smbiod_wake_up(void);
-extern int smbiod_register_server(struct smb_sb_info *server);
-extern void smbiod_unregister_server(struct smb_sb_info *server);
-extern void smbiod_flush(struct smb_sb_info *server);
-extern int smbiod_retry(struct smb_sb_info *server);
-/* request.c */
-extern int smb_init_request_cache(void);
-extern void smb_destroy_request_cache(void);
-extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
-extern void smb_rput(struct smb_request *req);
-extern int smb_add_request(struct smb_request *req);
-extern int smb_request_send_server(struct smb_sb_info *server);
-extern int smb_request_recv(struct smb_sb_info *server);
-/* symlink.c */
-extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
-extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- *  request.c
- *
- *  Copyright (C) 2001 by Urban Widmark
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-
-/* #define SMB_SLAB_DEBUG	(SLAB_RED_ZONE | SLAB_POISON) */
-#define SMB_SLAB_DEBUG	0
-
-/* cache for request structures */
-static struct kmem_cache *req_cachep;
-
-static int smb_request_send_req(struct smb_request *req);
-
-/*
-  /proc/slabinfo:
-  name, active, num, objsize, active_slabs, num_slaps, #pages
-*/
-
-
-int smb_init_request_cache(void)
-{
-	req_cachep = kmem_cache_create("smb_request",
-				       sizeof(struct smb_request), 0,
-				       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
-				       NULL);
-	if (req_cachep == NULL)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void smb_destroy_request_cache(void)
-{
-	kmem_cache_destroy(req_cachep);
-}
-
-/*
- * Allocate and initialise a request structure
- */
-static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
-						int bufsize)
-{
-	struct smb_request *req;
-	unsigned char *buf = NULL;
-
-	req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
-	VERBOSE("allocating request: %p\n", req);
-	if (!req)
-		goto out;
-
-	if (bufsize > 0) {
-		buf = kmalloc(bufsize, GFP_NOFS);
-		if (!buf) {
-			kmem_cache_free(req_cachep, req);
-			return NULL;
-		}
-	}
-
-	req->rq_buffer = buf;
-	req->rq_bufsize = bufsize;
-	req->rq_server = server;
-	init_waitqueue_head(&req->rq_wait);
-	INIT_LIST_HEAD(&req->rq_queue);
-	atomic_set(&req->rq_count, 1);
-
-out:
-	return req;
-}
-
-struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
-{
-	struct smb_request *req = NULL;
-
-	for (;;) {
-		atomic_inc(&server->nr_requests);
-		if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
-			req = smb_do_alloc_request(server, bufsize);
-			if (req != NULL)
-				break;
-		}
-
-#if 0
-		/*
-		 * Try to free up at least one request in order to stay
-		 * below the hard limit
-		 */
-                if (nfs_try_to_free_pages(server))
-			continue;
-
-		if (fatal_signal_pending(current))
-			return ERR_PTR(-ERESTARTSYS);
-		current->policy = SCHED_YIELD;
-		schedule();
-#else
-		/* FIXME: we want something like nfs does above, but that
-		   requires changes to all callers and can wait. */
-		break;
-#endif
-	}
-	return req;
-}
-
-static void smb_free_request(struct smb_request *req)
-{
-	atomic_dec(&req->rq_server->nr_requests);
-	if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
-		kfree(req->rq_buffer);
-	kfree(req->rq_trans2buffer);
-	kmem_cache_free(req_cachep, req);
-}
-
-/*
- * What prevents a rget to race with a rput? The count must never drop to zero
- * while it is in use. Only rput if it is ok that it is free'd.
- */
-static void smb_rget(struct smb_request *req)
-{
-	atomic_inc(&req->rq_count);
-}
-void smb_rput(struct smb_request *req)
-{
-	if (atomic_dec_and_test(&req->rq_count)) {
-		list_del_init(&req->rq_queue);
-		smb_free_request(req);
-	}
-}
-
-/* setup to receive the data part of the SMB */
-static int smb_setup_bcc(struct smb_request *req)
-{
-	int result = 0;
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-
-	if (req->rq_rlen > req->rq_bufsize) {
-		PARANOIA("Packet too large %d > %d\n",
-			 req->rq_rlen, req->rq_bufsize);
-		return -ENOBUFS;
-	}
-
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = req->rq_rlen;
-	req->rq_iovlen = 1;
-
-	return result;
-}
-
-/*
- * Prepare a "normal" request structure.
- */
-static int smb_setup_request(struct smb_request *req)
-{
-	int len = smb_len(req->rq_header) + 4;
-	req->rq_slen = len;
-
-	/* if we expect a data part in the reply we set the iov's to read it */
-	if (req->rq_resp_bcc)
-		req->rq_setup_read = smb_setup_bcc;
-
-	/* This tries to support re-using the same request */
-	req->rq_bytes_sent = 0;
-	req->rq_rcls = 0;
-	req->rq_err = 0;
-	req->rq_errno = 0;
-	req->rq_fragment = 0;
-	kfree(req->rq_trans2buffer);
-	req->rq_trans2buffer = NULL;
-
-	return 0;
-}
-
-/*
- * Prepare a transaction2 request structure
- */
-static int smb_setup_trans2request(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	int mparam, mdata;
-	static unsigned char padding[4];
-
-	/* I know the following is very ugly, but I want to build the
-	   smb packet as efficiently as possible. */
-
-	const int smb_parameters = 15;
-	const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
-	const int oparam = ALIGN(header + 3, sizeof(u32));
-	const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
-	const int bcc = (req->rq_data ? odata + req->rq_ldata :
-					oparam + req->rq_lparm) - header;
-
-	if ((bcc + oparam) > server->opt.max_xmit)
-		return -ENOMEM;
-	smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
-
-	/*
-	 * max parameters + max data + max setup == bufsize to make NT4 happy
-	 * and not abort the transfer or split into multiple responses. It also
-	 * makes smbfs happy as handling packets larger than the buffer size
-	 * is extra work.
-	 *
-	 * OS/2 is probably going to hate me for this ...
-	 */
-	mparam = SMB_TRANS2_MAX_PARAM;
-	mdata = req->rq_bufsize - mparam;
-
-	mdata = server->opt.max_xmit - mparam - 100;
-	if (mdata < 1024) {
-		mdata = 1024;
-		mparam = 20;
-	}
-
-#if 0
-	/* NT/win2k has ~4k max_xmit, so with this we request more than it wants
-	   to return as one SMB. Useful for testing the fragmented trans2
-	   handling. */
-	mdata = 8192;
-#endif
-
-	WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
-	WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
-	WSET(req->rq_header, smb_mprcnt, mparam);
-	WSET(req->rq_header, smb_mdrcnt, mdata);
-	WSET(req->rq_header, smb_msrcnt, 0);    /* max setup always 0 ? */
-	WSET(req->rq_header, smb_flags, 0);
-	DSET(req->rq_header, smb_timeout, 0);
-	WSET(req->rq_header, smb_pscnt, req->rq_lparm);
-	WSET(req->rq_header, smb_psoff, oparam - 4);
-	WSET(req->rq_header, smb_dscnt, req->rq_ldata);
-	WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
-	*(req->rq_header + smb_suwcnt) = 0x01;          /* setup count */
-	*(req->rq_header + smb_suwcnt + 1) = 0x00;      /* reserved */
-	WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
-
-	req->rq_iovlen = 2;
-	req->rq_iov[0].iov_base = (void *) req->rq_header;
-	req->rq_iov[0].iov_len = oparam;
-	req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
-	req->rq_iov[1].iov_len = req->rq_lparm;
-	req->rq_slen = oparam + req->rq_lparm;
-
-	if (req->rq_data) {
-		req->rq_iovlen += 2;
-		req->rq_iov[2].iov_base = padding;
-		req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
-		req->rq_iov[3].iov_base = req->rq_data;
-		req->rq_iov[3].iov_len = req->rq_ldata;
-		req->rq_slen = odata + req->rq_ldata;
-	}
-
-	/* always a data part for trans2 replies */
-	req->rq_setup_read = smb_setup_bcc;
-
-	return 0;
-}
-
-/*
- * Add a request and tell smbiod to process it
- */
-int smb_add_request(struct smb_request *req)
-{
-	long timeleft;
-	struct smb_sb_info *server = req->rq_server;
-	int result = 0;
-
-	smb_setup_request(req);
-	if (req->rq_trans2_command) {
-		if (req->rq_buffer == NULL) {
-			PARANOIA("trans2 attempted without response buffer!\n");
-			return -EIO;
-		}
-		result = smb_setup_trans2request(req);
-	}
-	if (result < 0)
-		return result;
-
-#ifdef SMB_DEBUG_PACKET_SIZE
-	add_xmit_stats(req);
-#endif
-
-	/* add 'req' to the queue of requests */
-	if (smb_lock_server_interruptible(server))
-		return -EINTR;
-
-	/*
-	 * Try to send the request as the process. If that fails we queue the
-	 * request and let smbiod send it later.
-	 */
-
-	/* FIXME: each server has a number on the maximum number of parallel
-	   requests. 10, 50 or so. We should not allow more requests to be
-	   active. */
-	if (server->mid > 0xf000)
-		server->mid = 0;
-	req->rq_mid = server->mid++;
-	WSET(req->rq_header, smb_mid, req->rq_mid);
-
-	result = 0;
-	if (server->state == CONN_VALID) {
-		if (list_empty(&server->xmitq))
-			result = smb_request_send_req(req);
-		if (result < 0) {
-			/* Connection lost? */
-			server->conn_error = result;
-			server->state = CONN_INVALID;
-		}
-	}
-	if (result != 1)
-		list_add_tail(&req->rq_queue, &server->xmitq);
-	smb_rget(req);
-
-	if (server->state != CONN_VALID)
-		smbiod_retry(server);
-
-	smb_unlock_server(server);
-
-	smbiod_wake_up();
-
-	timeleft = wait_event_interruptible_timeout(req->rq_wait,
-				    req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
-	if (!timeleft || signal_pending(current)) {
-		/*
-		 * On timeout or on interrupt we want to try and remove the
-		 * request from the recvq/xmitq.
-		 * First check if the request is still part of a queue. (May
-		 * have been removed by some error condition)
-		 */
-		smb_lock_server(server);
-		if (!list_empty(&req->rq_queue)) {
-			list_del_init(&req->rq_queue);
-			smb_rput(req);
-		}
-		smb_unlock_server(server);
-	}
-
-	if (!timeleft) {
-		PARANOIA("request [%p, mid=%d] timed out!\n",
-			 req, req->rq_mid);
-		VERBOSE("smb_com:  %02x\n", *(req->rq_header + smb_com));
-		VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
-		VERBOSE("smb_flg:  %02x\n", *(req->rq_header + smb_flg));
-		VERBOSE("smb_tid:  %04x\n", WVAL(req->rq_header, smb_tid));
-		VERBOSE("smb_pid:  %04x\n", WVAL(req->rq_header, smb_pid));
-		VERBOSE("smb_uid:  %04x\n", WVAL(req->rq_header, smb_uid));
-		VERBOSE("smb_mid:  %04x\n", WVAL(req->rq_header, smb_mid));
-		VERBOSE("smb_wct:  %02x\n", *(req->rq_header + smb_wct));
-
-		req->rq_rcls = ERRSRV;
-		req->rq_err  = ERRtimeout;
-
-		/* Just in case it was "stuck" */
-		smbiod_wake_up();
-	}
-	VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
-
-	if (req->rq_rcls != 0)
-		req->rq_errno = smb_errno(req);
-	if (signal_pending(current))
-		req->rq_errno = -ERESTARTSYS;
-	return req->rq_errno;
-}
-
-/*
- * Send a request and place it on the recvq if successfully sent.
- * Must be called with the server lock held.
- */
-static int smb_request_send_req(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	int result;
-
-	if (req->rq_bytes_sent == 0) {
-		WSET(req->rq_header, smb_tid, server->opt.tid);
-		WSET(req->rq_header, smb_pid, 1);
-		WSET(req->rq_header, smb_uid, server->opt.server_uid);
-	}
-
-	result = smb_send_request(req);
-	if (result < 0 && result != -EAGAIN)
-		goto out;
-
-	result = 0;
-	if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
-		goto out;
-
-	list_move_tail(&req->rq_queue, &server->recvq);
-	result = 1;
-out:
-	return result;
-}
-
-/*
- * Sends one request for this server. (smbiod)
- * Must be called with the server lock held.
- * Returns: <0 on error
- *           0 if no request could be completely sent
- *           1 if all data for one request was sent
- */
-int smb_request_send_server(struct smb_sb_info *server)
-{
-	struct list_head *head;
-	struct smb_request *req;
-	int result;
-
-	if (server->state != CONN_VALID)
-		return 0;
-
-	/* dequeue first request, if any */
-	req = NULL;
-	head = server->xmitq.next;
-	if (head != &server->xmitq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-	}
-	if (!req)
-		return 0;
-
-	result = smb_request_send_req(req);
-	if (result < 0) {
-		server->conn_error = result;
-		list_move(&req->rq_queue, &server->xmitq);
-		result = -EIO;
-		goto out;
-	}
-
-out:
-	return result;
-}
-
-/*
- * Try to find a request matching this "mid". Typically the first entry will
- * be the matching one.
- */
-static struct smb_request *find_request(struct smb_sb_info *server, int mid)
-{
-	struct list_head *tmp;
-	struct smb_request *req = NULL;
-
-	list_for_each(tmp, &server->recvq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		if (req->rq_mid == mid) {
-			break;
-		}
-		req = NULL;
-	}
-
-	if (!req) {
-		VERBOSE("received reply with mid %d but no request!\n",
-			WVAL(server->header, smb_mid));
-		server->rstate = SMB_RECV_DROP;
-	}
-
-	return req;
-}
-
-/*
- * Called when we have read the smb header and believe this is a response.
- */
-static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
-{
-	int hdrlen, wct;
-
-	memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
-
-	wct = *(req->rq_header + smb_wct);
-	if (wct > 20) {	
-		PARANOIA("wct too large, %d > 20\n", wct);
-		server->rstate = SMB_RECV_DROP;
-		return 0;
-	}
-
-	req->rq_resp_wct = wct;
-	hdrlen = SMB_HEADER_LEN + wct*2 + 2;
-	VERBOSE("header length: %d   smb_wct: %2d\n", hdrlen, wct);
-
-	req->rq_bytes_recvd = SMB_HEADER_LEN;
-	req->rq_rlen = hdrlen;
-	req->rq_iov[0].iov_base = req->rq_header;
-	req->rq_iov[0].iov_len  = hdrlen;
-	req->rq_iovlen = 1;
-	server->rstate = SMB_RECV_PARAM;
-
-#ifdef SMB_DEBUG_PACKET_SIZE
-	add_recv_stats(smb_len(server->header));
-#endif
-	return 0;
-}
-
-/*
- * Reads the SMB parameters
- */
-static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
-{
-	int result;
-
-	result = smb_receive(server, req);
-	if (result < 0)
-		return result;
-	if (req->rq_bytes_recvd < req->rq_rlen)
-		return 0;
-
-	VERBOSE("result: %d   smb_bcc:  %04x\n", result,
-		WVAL(req->rq_header, SMB_HEADER_LEN +
-		     (*(req->rq_header + smb_wct) * 2)));
-
-	result = 0;
-	req->rq_iov[0].iov_base = NULL;
-	req->rq_rlen = 0;
-	if (req->rq_callback)
-		req->rq_callback(req);
-	else if (req->rq_setup_read)
-		result = req->rq_setup_read(req);
-	if (result < 0) {
-		server->rstate = SMB_RECV_DROP;
-		return result;
-	}
-
-	server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
-
-	req->rq_bytes_recvd = 0;	// recvd out of the iov
-
-	VERBOSE("rlen: %d\n", req->rq_rlen);
-	if (req->rq_rlen < 0) {
-		PARANOIA("Parameters read beyond end of packet!\n");
-		server->rstate = SMB_RECV_END;
-		return -EIO;
-	}
-	return 0;
-}
-
-/*
- * Reads the SMB data
- */
-static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
-{
-	int result;
-
-	result = smb_receive(server, req);
-	if (result < 0)
-		goto out;
-	if (req->rq_bytes_recvd < req->rq_rlen)
-		goto out;
-	server->rstate = SMB_RECV_END;
-out:
-	VERBOSE("result: %d\n", result);
-	return result;
-}
-
-/*
- * Receive a transaction2 response
- * Return: 0 if the response has been fully read
- *         1 if there are further "fragments" to read
- *        <0 if there is an error
- */
-static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
-{
-	unsigned char *inbuf;
-	unsigned int parm_disp, parm_offset, parm_count, parm_tot;
-	unsigned int data_disp, data_offset, data_count, data_tot;
-	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-
-	VERBOSE("handling trans2\n");
-
-	inbuf = req->rq_header;
-	data_tot    = WVAL(inbuf, smb_tdrcnt);
-	parm_tot    = WVAL(inbuf, smb_tprcnt);
-	parm_disp   = WVAL(inbuf, smb_prdisp);
-	parm_offset = WVAL(inbuf, smb_proff);
-	parm_count  = WVAL(inbuf, smb_prcnt);
-	data_disp   = WVAL(inbuf, smb_drdisp);
-	data_offset = WVAL(inbuf, smb_droff);
-	data_count  = WVAL(inbuf, smb_drcnt);
-
-	/* Modify offset for the split header/buffer we use */
-	if (data_count || data_offset) {
-		if (unlikely(data_offset < hdrlen))
-			goto out_bad_data;
-		else
-			data_offset -= hdrlen;
-	}
-	if (parm_count || parm_offset) {
-		if (unlikely(parm_offset < hdrlen))
-			goto out_bad_parm;
-		else
-			parm_offset -= hdrlen;
-	}
-
-	if (parm_count == parm_tot && data_count == data_tot) {
-		/*
-		 * This packet has all the trans2 data.
-		 *
-		 * We setup the request so that this will be the common
-		 * case. It may be a server error to not return a
-		 * response that fits.
-		 */
-		VERBOSE("single trans2 response  "
-			"dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-			data_count, parm_count,
-			data_offset, parm_offset);
-		req->rq_ldata = data_count;
-		req->rq_lparm = parm_count;
-		req->rq_data = req->rq_buffer + data_offset;
-		req->rq_parm = req->rq_buffer + parm_offset;
-		if (unlikely(parm_offset + parm_count > req->rq_rlen))
-			goto out_bad_parm;
-		if (unlikely(data_offset + data_count > req->rq_rlen))
-			goto out_bad_data;
-		return 0;
-	}
-
-	VERBOSE("multi trans2 response  "
-		"frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-		req->rq_fragment,
-		data_count, parm_count,
-		data_offset, parm_offset);
-
-	if (!req->rq_fragment) {
-		int buf_len;
-
-		/* We got the first trans2 fragment */
-		req->rq_fragment = 1;
-		req->rq_total_data = data_tot;
-		req->rq_total_parm = parm_tot;
-		req->rq_ldata = 0;
-		req->rq_lparm = 0;
-
-		buf_len = data_tot + parm_tot;
-		if (buf_len > SMB_MAX_PACKET_SIZE)
-			goto out_too_long;
-
-		req->rq_trans2bufsize = buf_len;
-		req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
-		if (!req->rq_trans2buffer)
-			goto out_no_mem;
-
-		req->rq_parm = req->rq_trans2buffer;
-		req->rq_data = req->rq_trans2buffer + parm_tot;
-	} else if (unlikely(req->rq_total_data < data_tot ||
-			    req->rq_total_parm < parm_tot))
-		goto out_data_grew;
-
-	if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
-		     parm_offset + parm_count > req->rq_rlen))
-		goto out_bad_parm;
-	if (unlikely(data_disp + data_count > req->rq_total_data ||
-		     data_offset + data_count > req->rq_rlen))
-		goto out_bad_data;
-
-	inbuf = req->rq_buffer;
-	memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
-	memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
-
-	req->rq_ldata += data_count;
-	req->rq_lparm += parm_count;
-
-	/*
-	 * Check whether we've received all of the data. Note that
-	 * we use the packet totals -- total lengths might shrink!
-	 */
-	if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
-		req->rq_ldata = data_tot;
-		req->rq_lparm = parm_tot;
-		return 0;
-	}
-	return 1;
-
-out_too_long:
-	printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
-		data_tot, parm_tot);
-	goto out_EIO;
-out_no_mem:
-	printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
-	       req->rq_trans2bufsize);
-	req->rq_errno = -ENOMEM;
-	goto out;
-out_data_grew:
-	printk(KERN_ERR "smb_trans2: data/params grew!\n");
-	goto out_EIO;
-out_bad_parm:
-	printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-	       parm_disp, parm_count, parm_tot, parm_offset);
-	goto out_EIO;
-out_bad_data:
-	printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-	       data_disp, data_count, data_tot, data_offset);
-out_EIO:
-	req->rq_errno = -EIO;
-out:
-	return req->rq_errno;
-}
-
-/*
- * State machine for receiving responses. We handle the fact that we can't
- * read the full response in one try by having states telling us how much we
- * have read.
- *
- * Must be called with the server lock held (only called from smbiod).
- *
- * Return: <0 on error
- */
-int smb_request_recv(struct smb_sb_info *server)
-{
-	struct smb_request *req = NULL;
-	int result = 0;
-
-	if (smb_recv_available(server) <= 0)
-		return 0;
-
-	VERBOSE("state: %d\n", server->rstate);
-	switch (server->rstate) {
-	case SMB_RECV_DROP:
-		result = smb_receive_drop(server);
-		if (result < 0)
-			break;
-		if (server->rstate == SMB_RECV_DROP)
-			break;
-		server->rstate = SMB_RECV_START;
-		/* fallthrough */
-	case SMB_RECV_START:
-		server->smb_read = 0;
-		server->rstate = SMB_RECV_HEADER;
-		/* fallthrough */
-	case SMB_RECV_HEADER:
-		result = smb_receive_header(server);
-		if (result < 0)
-			break;
-		if (server->rstate == SMB_RECV_HEADER)
-			break;
-		if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
-			server->rstate = SMB_RECV_REQUEST;
-			break;
-		}
-		if (server->rstate != SMB_RECV_HCOMPLETE)
-			break;
-		/* fallthrough */
-	case SMB_RECV_HCOMPLETE:
-		req = find_request(server, WVAL(server->header, smb_mid));
-		if (!req)
-			break;
-		smb_init_request(server, req);
-		req->rq_rcls = *(req->rq_header + smb_rcls);
-		req->rq_err  = WVAL(req->rq_header, smb_err);
-		if (server->rstate != SMB_RECV_PARAM)
-			break;
-		/* fallthrough */
-	case SMB_RECV_PARAM:
-		if (!req)
-			req = find_request(server,WVAL(server->header,smb_mid));
-		if (!req)
-			break;
-		result = smb_recv_param(server, req);
-		if (result < 0)
-			break;
-		if (server->rstate != SMB_RECV_DATA)
-			break;
-		/* fallthrough */
-	case SMB_RECV_DATA:
-		if (!req)
-			req = find_request(server,WVAL(server->header,smb_mid));
-		if (!req)
-			break;
-		result = smb_recv_data(server, req);
-		if (result < 0)
-			break;
-		break;
-
-		/* We should never be called with any of these states */
-	case SMB_RECV_END:
-	case SMB_RECV_REQUEST:
-		BUG();
-	}
-
-	if (result < 0) {
-		/* We saw an error */
-		return result;
-	}
-
-	if (server->rstate != SMB_RECV_END)
-		return 0;
-
-	result = 0;
-	if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
-		result = smb_recv_trans2(server, req);
-
-	/*
-	 * Response completely read. Drop any extra bytes sent by the server.
-	 * (Yes, servers sometimes add extra bytes to responses)
-	 */
-	VERBOSE("smb_len: %d   smb_read: %d\n",
-		server->smb_len, server->smb_read);
-	if (server->smb_read < server->smb_len)
-		smb_receive_drop(server);
-
-	server->rstate = SMB_RECV_START;
-
-	if (!result) {
-		list_del_init(&req->rq_queue);
-		req->rq_flags |= SMB_REQ_RECEIVED;
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-	return 0;
-}
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-#include <linux/wait.h>
-
-struct smb_request {
-	struct list_head rq_queue;	/* recvq or xmitq for the server */
-
-	atomic_t rq_count;
-
-	wait_queue_head_t rq_wait;
-	int rq_flags;
-	int rq_mid;	/* multiplex ID, set by request.c */
-
-	struct smb_sb_info *rq_server;
-
-	/* header + word count + parameter words + byte count */
-	unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
-
-	int rq_bufsize;
-	unsigned char *rq_buffer;
-
-	/* FIXME: this is not good enough for merging IO requests. */
-	unsigned char *rq_page;
-	int rq_rsize;
-
-	int rq_resp_wct;
-	int rq_resp_bcc;
-
-	int rq_rlen;
-	int rq_bytes_recvd;
-
-	int rq_slen;
-	int rq_bytes_sent;
-
-	int rq_iovlen;
-	struct kvec rq_iov[4];
-
-	int (*rq_setup_read) (struct smb_request *);
-	void (*rq_callback) (struct smb_request *);
-
-	/* ------ trans2 stuff ------ */
-
-	u16 rq_trans2_command;	/* 0 if not a trans2 request */
-	unsigned int rq_ldata;
-	unsigned char *rq_data;
-	unsigned int rq_lparm;
-	unsigned char *rq_parm;
-
-	int rq_fragment;
-	u32 rq_total_data;
-	u32 rq_total_parm;
-	int rq_trans2bufsize;
-	unsigned char *rq_trans2buffer;
-
-	/* ------ response ------ */
-
-	unsigned short rq_rcls;
-	unsigned short rq_err;
-	int rq_errno;
-};
-
-#define SMB_REQ_STATIC		0x0001	/* rq_buffer is static */
-#define SMB_REQ_NORETRY		0x0002	/* request is invalid after retry */
-
-#define SMB_REQ_TRANSMITTED	0x4000	/* all data has been sent */
-#define SMB_REQ_RECEIVED	0x8000	/* reply received, smbiod is done */
-
-#define xSMB_REQ_NOREPLY	0x0004	/* we don't want the reply (if any) */
-#define xSMB_REQ_NORECEIVER	0x0008	/* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Defines some debug macros for smbfs.
- */
-
-/* This makes a dentry parent/child name pair. Useful for debugging printk's */
-#define DENTRY_PATH(dentry) \
-	(dentry)->d_parent->d_name.name,(dentry)->d_name.name
-
-/*
- * safety checks that should never happen ???
- * these are normally enabled.
- */
-#ifdef SMBFS_PARANOIA
-# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
-#else
-# define PARANOIA(f, a...) do { ; } while(0)
-#endif
-
-/* lots of debug messages */
-#ifdef SMBFS_DEBUG_VERBOSE
-# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-# define VERBOSE(f, a...) do { ; } while(0)
-#endif
-
-/*
- * "normal" debug messages, but not with a normal DEBUG define ... way
- * too common name.
- */
-#ifdef SMBFS_DEBUG
-#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-#define DEBUG1(f, a...) do { ; } while(0)
-#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- *  smbiod.c
- *
- *  Copyright (C) 2000, Charles Loep / Corel Corp.
- *  Copyright (C) 2001, Urban Widmark
- */
-
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/kthread.h>
-#include <net/ip.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-
-enum smbiod_state {
-	SMBIOD_DEAD,
-	SMBIOD_STARTING,
-	SMBIOD_RUNNING,
-};
-
-static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static struct task_struct *smbiod_thread;
-static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
-static LIST_HEAD(smb_servers);
-static DEFINE_SPINLOCK(servers_lock);
-
-#define SMBIOD_DATA_READY	(1<<0)
-static unsigned long smbiod_flags;
-
-static int smbiod(void *);
-static int smbiod_start(void);
-
-/*
- * called when there's work for us to do
- */
-void smbiod_wake_up(void)
-{
-	if (smbiod_state == SMBIOD_DEAD)
-		return;
-	set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-	wake_up_interruptible(&smbiod_wait);
-}
-
-/*
- * start smbiod if none is running
- */
-static int smbiod_start(void)
-{
-	struct task_struct *tsk;
-	int err = 0;
-
-	if (smbiod_state != SMBIOD_DEAD)
-		return 0;
-	smbiod_state = SMBIOD_STARTING;
-	__module_get(THIS_MODULE);
-	spin_unlock(&servers_lock);
-	tsk = kthread_run(smbiod, NULL, "smbiod");
-	if (IS_ERR(tsk)) {
-		err = PTR_ERR(tsk);
-		module_put(THIS_MODULE);
-	}
-
-	spin_lock(&servers_lock);
-	if (err < 0) {
-		smbiod_state = SMBIOD_DEAD;
-		smbiod_thread = NULL;
-	} else {
-		smbiod_state = SMBIOD_RUNNING;
-		smbiod_thread = tsk;
-	}
-	return err;
-}
-
-/*
- * register a server & start smbiod if necessary
- */
-int smbiod_register_server(struct smb_sb_info *server)
-{
-	int ret;
-	spin_lock(&servers_lock);
-	list_add(&server->entry, &smb_servers);
-	VERBOSE("%p\n", server);
-	ret = smbiod_start();
-	spin_unlock(&servers_lock);
-	return ret;
-}
-
-/*
- * Unregister a server
- * Must be called with the server lock held.
- */
-void smbiod_unregister_server(struct smb_sb_info *server)
-{
-	spin_lock(&servers_lock);
-	list_del_init(&server->entry);
-	VERBOSE("%p\n", server);
-	spin_unlock(&servers_lock);
-
-	smbiod_wake_up();
-	smbiod_flush(server);
-}
-
-void smbiod_flush(struct smb_sb_info *server)
-{
-	struct list_head *tmp, *n;
-	struct smb_request *req;
-
-	list_for_each_safe(tmp, n, &server->xmitq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-	list_for_each_safe(tmp, n, &server->recvq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-}
-
-/*
- * Wake up smbmount and make it reconnect to the server.
- * This must be called with the server locked.
- *
- * FIXME: add smbconnect version to this
- */
-int smbiod_retry(struct smb_sb_info *server)
-{
-	struct list_head *head;
-	struct smb_request *req;
-	struct pid *pid = get_pid(server->conn_pid);
-	int result = 0;
-
-	VERBOSE("state: %d\n", server->state);
-	if (server->state == CONN_VALID || server->state == CONN_RETRYING)
-		goto out;
-
-	smb_invalidate_inodes(server);
-
-	/*
-	 * Some requests are meaningless after a retry, so we abort them.
-	 * One example are all requests using 'fileid' since the files are
-	 * closed on retry.
-	 */
-	head = server->xmitq.next;
-	while (head != &server->xmitq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-		head = head->next;
-
-		req->rq_bytes_sent = 0;
-		if (req->rq_flags & SMB_REQ_NORETRY) {
-			VERBOSE("aborting request %p on xmitq\n", req);
-			req->rq_errno = -EIO;
-			list_del_init(&req->rq_queue);
-			smb_rput(req);
-			wake_up_interruptible(&req->rq_wait);
-		}
-	}
-
-	/*
-	 * FIXME: test the code for retrying request we already sent
-	 */
-	head = server->recvq.next;
-	while (head != &server->recvq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-		head = head->next;
-#if 0
-		if (req->rq_flags & SMB_REQ_RETRY) {
-			/* must move the request to the xmitq */
-			VERBOSE("retrying request %p on recvq\n", req);
-			list_move(&req->rq_queue, &server->xmitq);
-			continue;
-		}
-#endif
-
-		VERBOSE("aborting request %p on recvq\n", req);
-		/* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-
-	smb_close_socket(server);
-
-	if (!pid) {
-		/* FIXME: this is fatal, umount? */
-		printk(KERN_ERR "smb_retry: no connection process\n");
-		server->state = CONN_RETRIED;
-		goto out;
-	}
-
-	/*
-	 * Change state so that only one retry per server will be started.
-	 */
-	server->state = CONN_RETRYING;
-
-	/*
-	 * Note: use the "priv" flag, as a user process may need to reconnect.
-	 */
-	result = kill_pid(pid, SIGUSR1, 1);
-	if (result) {
-		/* FIXME: this is most likely fatal, umount? */
-		printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
-		goto out;
-	}
-	VERBOSE("signalled pid %d\n", pid_nr(pid));
-
-	/* FIXME: The retried requests should perhaps get a "time boost". */
-
-out:
-	put_pid(pid);
-	return result;
-}
-
-/*
- * Currently handles lockingX packets.
- */
-static void smbiod_handle_request(struct smb_sb_info *server)
-{
-	PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
-	server->rstate = SMB_RECV_DROP;
-}
-
-/*
- * Do some IO for one server.
- */
-static void smbiod_doio(struct smb_sb_info *server)
-{
-	int result;
-	int maxwork = 7;
-
-	if (server->state != CONN_VALID)
-		goto out;
-
-	do {
-		result = smb_request_recv(server);
-		if (result < 0) {
-			server->state = CONN_INVALID;
-			smbiod_retry(server);
-			goto out;	/* reconnecting is slow */
-		} else if (server->rstate == SMB_RECV_REQUEST)
-			smbiod_handle_request(server);
-	} while (result > 0 && maxwork-- > 0);
-
-	/*
-	 * If there is more to read then we want to be sure to wake up again.
-	 */
-	if (server->state != CONN_VALID)
-		goto out;
-	if (smb_recv_available(server) > 0)
-		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-	do {
-		result = smb_request_send_server(server);
-		if (result < 0) {
-			server->state = CONN_INVALID;
-			smbiod_retry(server);
-			goto out;	/* reconnecting is slow */
-		}
-	} while (result > 0);
-
-	/*
-	 * If the last request was not sent out we want to wake up again.
-	 */
-	if (!list_empty(&server->xmitq))
-		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-out:
-	return;
-}
-
-/*
- * smbiod kernel thread
- */
-static int smbiod(void *unused)
-{
-	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
-
-	for (;;) {
-		struct smb_sb_info *server;
-		struct list_head *pos, *n;
-
-		/* FIXME: Use poll? */
-		wait_event_interruptible(smbiod_wait,
-			 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
-		if (signal_pending(current)) {
-			spin_lock(&servers_lock);
-			smbiod_state = SMBIOD_DEAD;
-			spin_unlock(&servers_lock);
-			break;
-		}
-
-		clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-		spin_lock(&servers_lock);
-		if (list_empty(&smb_servers)) {
-			smbiod_state = SMBIOD_DEAD;
-			spin_unlock(&servers_lock);
-			break;
-		}
-
-		list_for_each_safe(pos, n, &smb_servers) {
-			server = list_entry(pos, struct smb_sb_info, entry);
-			VERBOSE("checking server %p\n", server);
-
-			if (server->state == CONN_VALID) {
-				spin_unlock(&servers_lock);
-
-				smb_lock_server(server);
-				smbiod_doio(server);
-				smb_unlock_server(server);
-
-				spin_lock(&servers_lock);
-			}
-		}
-		spin_unlock(&servers_lock);
-	}
-
-	VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
-	module_put_and_exit(0);
-}
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- *  sock.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/fcntl.h>
-#include <linux/file.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/mm.h>
-#include <linux/netdevice.h>
-#include <linux/workqueue.h>
-#include <net/scm.h>
-#include <net/tcp_states.h>
-#include <net/ip.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb.h>
-#include <linux/smbno.h>
-
-#include <asm/uaccess.h>
-#include <asm/ioctls.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-
-
-static int
-_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
-{
-	struct kvec iov = {ubuf, size};
-	struct msghdr msg = {.msg_flags = flags};
-	msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
-	return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
-}
-
-/*
- * Return the server this socket belongs to
- */
-static struct smb_sb_info *
-server_from_socket(struct socket *socket)
-{
-	return socket->sk->sk_user_data;
-}
-
-/*
- * Called when there is data on the socket.
- */
-void
-smb_data_ready(struct sock *sk, int len)
-{
-	struct smb_sb_info *server = server_from_socket(sk->sk_socket);
-	void (*data_ready)(struct sock *, int) = server->data_ready;
-
-	data_ready(sk, len);
-	VERBOSE("(%p, %d)\n", sk, len);
-	smbiod_wake_up();
-}
-
-int
-smb_valid_socket(struct inode * inode)
-{
-	return (inode && S_ISSOCK(inode->i_mode) && 
-		SOCKET_I(inode)->type == SOCK_STREAM);
-}
-
-static struct socket *
-server_sock(struct smb_sb_info *server)
-{
-	struct file *file;
-
-	if (server && (file = server->sock_file))
-	{
-#ifdef SMBFS_PARANOIA
-		if (!smb_valid_socket(file->f_path.dentry->d_inode))
-			PARANOIA("bad socket!\n");
-#endif
-		return SOCKET_I(file->f_path.dentry->d_inode);
-	}
-	return NULL;
-}
-
-void
-smb_close_socket(struct smb_sb_info *server)
-{
-	struct file * file = server->sock_file;
-
-	if (file) {
-		struct socket *sock = server_sock(server);
-
-		VERBOSE("closing socket %p\n", sock);
-		sock->sk->sk_data_ready = server->data_ready;
-		server->sock_file = NULL;
-		fput(file);
-	}
-}
-
-static int
-smb_get_length(struct socket *socket, unsigned char *header)
-{
-	int result;
-
-	result = _recvfrom(socket, header, 4, MSG_PEEK);
-	if (result == -EAGAIN)
-		return -ENODATA;
-	if (result < 0) {
-		PARANOIA("recv error = %d\n", -result);
-		return result;
-	}
-	if (result < 4)
-		return -ENODATA;
-
-	switch (header[0]) {
-	case 0x00:
-	case 0x82:
-		break;
-
-	case 0x85:
-		DEBUG1("Got SESSION KEEP ALIVE\n");
-		_recvfrom(socket, header, 4, 0);	/* read away */
-		return -ENODATA;
-
-	default:
-		PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
-		return -EIO;
-	}
-
-	/* The length in the RFC NB header is the raw data length */
-	return smb_len(header);
-}
-
-int
-smb_recv_available(struct smb_sb_info *server)
-{
-	mm_segment_t oldfs;
-	int avail, err;
-	struct socket *sock = server_sock(server);
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
-	set_fs(oldfs);
-	return (err >= 0) ? avail : err;
-}
-
-/*
- * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
- */
-static int
-smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
-{
-	struct kvec *iv = *data;
-	int i;
-	int len;
-
-	/*
-	 *	Eat any sent kvecs
-	 */
-	while (iv->iov_len <= amount) {
-		amount -= iv->iov_len;
-		iv++;
-		(*num)--;
-	}
-
-	/*
-	 *	And chew down the partial one
-	 */
-	vec[0].iov_len = iv->iov_len-amount;
-	vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
-	iv++;
-
-	len = vec[0].iov_len;
-
-	/*
-	 *	And copy any others
-	 */
-	for (i = 1; i < *num; i++) {
-		vec[i] = *iv++;
-		len += vec[i].iov_len;
-	}
-
-	*data = vec;
-	return len;
-}
-
-/*
- * smb_receive_header
- * Only called by the smbiod thread.
- */
-int
-smb_receive_header(struct smb_sb_info *server)
-{
-	struct socket *sock;
-	int result = 0;
-	unsigned char peek_buf[4];
-
-	result = -EIO; 
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	if (!server->smb_read) {
-		result = smb_get_length(sock, peek_buf);
-		if (result < 0) {
-			if (result == -ENODATA)
-				result = 0;
-			goto out;
-		}
-		server->smb_len = result + 4;
-
-		if (server->smb_len < SMB_HEADER_LEN) {
-			PARANOIA("short packet: %d\n", result);
-			server->rstate = SMB_RECV_DROP;
-			result = -EIO;
-			goto out;
-		}
-		if (server->smb_len > SMB_MAX_PACKET_SIZE) {
-			PARANOIA("long packet: %d\n", result);
-			server->rstate = SMB_RECV_DROP;
-			result = -EIO;
-			goto out;
-		}
-	}
-
-	result = _recvfrom(sock, server->header + server->smb_read,
-			   SMB_HEADER_LEN - server->smb_read, 0);
-	VERBOSE("_recvfrom: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	server->smb_read += result;
-
-	if (server->smb_read == SMB_HEADER_LEN)
-		server->rstate = SMB_RECV_HCOMPLETE;
-out:
-	return result;
-}
-
-static char drop_buffer[PAGE_SIZE];
-
-/*
- * smb_receive_drop - read and throw away the data
- * Only called by the smbiod thread.
- *
- * FIXME: we are in the kernel, could we just tell the socket that we want
- * to drop stuff from the buffer?
- */
-int
-smb_receive_drop(struct smb_sb_info *server)
-{
-	struct socket *sock;
-	unsigned int flags;
-	struct kvec iov;
-	struct msghdr msg;
-	int rlen = smb_len(server->header) - server->smb_read + 4;
-	int result = -EIO;
-
-	if (rlen > PAGE_SIZE)
-		rlen = PAGE_SIZE;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-	iov.iov_base = drop_buffer;
-	iov.iov_len = PAGE_SIZE;
-	msg.msg_flags = flags;
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = NULL;
-
-	result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
-
-	VERBOSE("read: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	server->smb_read += result;
-
-	if (server->smb_read >= server->smb_len)
-		server->rstate = SMB_RECV_END;
-
-out:
-	return result;
-}
-
-/*
- * smb_receive
- * Only called by the smbiod thread.
- */
-int
-smb_receive(struct smb_sb_info *server, struct smb_request *req)
-{
-	struct socket *sock;
-	unsigned int flags;
-	struct kvec iov[4];
-	struct kvec *p = req->rq_iov;
-	size_t num = req->rq_iovlen;
-	struct msghdr msg;
-	int rlen;
-	int result = -EIO;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-	msg.msg_flags = flags;
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = NULL;
-
-	/* Dont repeat bytes and count available bufferspace */
-	rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
-			(req->rq_rlen - req->rq_bytes_recvd));
-
-	result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
-
-	VERBOSE("read: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	req->rq_bytes_recvd += result;
-	server->smb_read += result;
-
-out:
-	return result;
-}
-
-/*
- * Try to send a SMB request. This may return after sending only parts of the
- * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
- *
- * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
- */
-int
-smb_send_request(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	struct socket *sock;
-	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
-        int slen = req->rq_slen - req->rq_bytes_sent;
-	int result = -EIO;
-	struct kvec iov[4];
-	struct kvec *p = req->rq_iov;
-	size_t num = req->rq_iovlen;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	/* Dont repeat bytes */
-	if (req->rq_bytes_sent)
-		smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
-
-	result = kernel_sendmsg(sock, &msg, p, num, slen);
-
-	if (result >= 0) {
-		req->rq_bytes_sent += result;
-		if (req->rq_bytes_sent >= req->rq_slen)
-			req->rq_flags |= SMB_REQ_TRANSMITTED;
-	}
-out:
-	return result;
-}
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  symlink.c
- *
- *  Copyright (C) 2002 by John Newbigin
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <linux/namei.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
-{
-	DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
-
-	return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
-}
-
-static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *link = __getname();
-	DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
-
-	if (!link) {
-		link = ERR_PTR(-ENOMEM);
-	} else {
-		int len = smb_proc_read_link(server_from_dentry(dentry),
-						dentry, link, PATH_MAX - 1);
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else {
-			link[len] = 0;
-		}
-	}
-	nd_set_link(nd, link);
-	return NULL;
-}
-
-static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		__putname(s);
-}
-
-const struct inode_operations smb_link_inode_operations =
-{
-	.readlink	= generic_readlink,
-	.follow_link	= smb_follow_link,
-	.put_link	= smb_put_link,
-};
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 626b629429ff..98d520d371ed 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -326,10 +326,6 @@ header-y += serio.h
 header-y += shm.h
 header-y += signal.h
 header-y += signalfd.h
-header-y += smb.h
-header-y += smb_fs.h
-header-y += smb_mount.h
-header-y += smbno.h
 header-y += snmp.h
 header-y += socket.h
 header-y += sockios.h
diff --git a/include/linux/smb.h b/include/linux/smb.h
deleted file mode 100644
index 82fefddc5987..000000000000
--- a/include/linux/smb.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *  smb.h
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- */
-
-#ifndef _LINUX_SMB_H
-#define _LINUX_SMB_H
-
-#include <linux/types.h>
-#include <linux/magic.h>
-#ifdef __KERNEL__
-#include <linux/time.h>
-#endif
-
-enum smb_protocol { 
-	SMB_PROTOCOL_NONE, 
-	SMB_PROTOCOL_CORE, 
-	SMB_PROTOCOL_COREPLUS, 
-	SMB_PROTOCOL_LANMAN1, 
-	SMB_PROTOCOL_LANMAN2, 
-	SMB_PROTOCOL_NT1 
-};
-
-enum smb_case_hndl {
-	SMB_CASE_DEFAULT,
-	SMB_CASE_LOWER,
-	SMB_CASE_UPPER
-};
-
-struct smb_dskattr {
-        __u16 total;
-        __u16 allocblocks;
-        __u16 blocksize;
-        __u16 free;
-};
-
-struct smb_conn_opt {
-
-        /* The socket */
-	unsigned int fd;
-
-	enum smb_protocol protocol;
-	enum smb_case_hndl case_handling;
-
-	/* Connection-Options */
-
-	__u32              max_xmit;
-	__u16              server_uid;
-	__u16              tid;
-
-        /* The following are LANMAN 1.0 options */
-        __u16              secmode;
-        __u16              maxmux;
-        __u16              maxvcs;
-        __u16              rawmode;
-        __u32              sesskey;
-
-	/* The following are NT LM 0.12 options */
-	__u32              maxraw;
-	__u32              capabilities;
-	__s16              serverzone;
-};
-
-#ifdef __KERNEL__
-
-#define SMB_NLS_MAXNAMELEN 20
-struct smb_nls_codepage {
-	char local_name[SMB_NLS_MAXNAMELEN];
-	char remote_name[SMB_NLS_MAXNAMELEN];
-};
-
-
-#define SMB_MAXNAMELEN 255
-#define SMB_MAXPATHLEN 1024
-
-/*
- * Contains all relevant data on a SMB networked file.
- */
-struct smb_fattr {
-	__u16 attr;
-
-	unsigned long	f_ino;
-	umode_t		f_mode;
-	nlink_t		f_nlink;
-	uid_t		f_uid;
-	gid_t		f_gid;
-	dev_t		f_rdev;
-	loff_t		f_size;
-	struct timespec	f_atime;
-	struct timespec f_mtime;
-	struct timespec f_ctime;
-	unsigned long	f_blocks;
-	int		f_unix;
-};
-
-enum smb_conn_state {
-	CONN_VALID,		/* everything's fine */
-	CONN_INVALID,		/* Something went wrong, but did not
-				   try to reconnect yet. */
-	CONN_RETRIED,		/* Tried a reconnection, but was refused */
-	CONN_RETRYING		/* Currently trying to reconnect */
-};
-
-#define SMB_HEADER_LEN   37     /* includes everything up to, but not
-                                 * including smb_bcc */
-
-#define SMB_INITIAL_PACKET_SIZE		4000
-#define SMB_MAX_PACKET_SIZE		32768
-
-/* reserve this much space for trans2 parameters. Shouldn't have to be more
-   than 10 or so, but OS/2 seems happier like this. */
-#define SMB_TRANS2_MAX_PARAM 64
-
-#endif
-#endif
diff --git a/include/linux/smb_fs.h b/include/linux/smb_fs.h
deleted file mode 100644
index 923cd8a247b1..000000000000
--- a/include/linux/smb_fs.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- *  smb_fs.h
- *
- *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- */
-
-#ifndef _LINUX_SMB_FS_H
-#define _LINUX_SMB_FS_H
-
-#include <linux/smb.h>
-
-/*
- * ioctl commands
- */
-#define	SMB_IOC_GETMOUNTUID		_IOR('u', 1, __kernel_old_uid_t)
-#define SMB_IOC_NEWCONN                 _IOW('u', 2, struct smb_conn_opt)
-
-/* __kernel_uid_t can never change, so we have to use __kernel_uid32_t */
-#define	SMB_IOC_GETMOUNTUID32		_IOR('u', 3, __kernel_uid32_t)
-
-
-#ifdef __KERNEL__
-#include <linux/smb_fs_i.h>
-#include <linux/smb_fs_sb.h>
-
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/vmalloc.h>
-#include <linux/smb_mount.h>
-#include <linux/jiffies.h>
-#include <asm/unaligned.h>
-
-static inline struct smb_sb_info *SMB_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct smb_inode_info *SMB_I(struct inode *inode)
-{
-	return container_of(inode, struct smb_inode_info, vfs_inode);
-}
-
-/* macro names are short for word, double-word, long value (?) */
-#define WVAL(buf, pos) (get_unaligned_le16((u8 *)(buf) + (pos)))
-#define DVAL(buf, pos) (get_unaligned_le32((u8 *)(buf) + (pos)))
-#define LVAL(buf, pos) (get_unaligned_le64((u8 *)(buf) + (pos)))
-
-#define WSET(buf, pos, val) put_unaligned_le16((val), (u8 *)(buf) + (pos))
-#define DSET(buf, pos, val) put_unaligned_le32((val), (u8 *)(buf) + (pos))
-#define LSET(buf, pos, val) put_unaligned_le64((val), (u8 *)(buf) + (pos))
-
-/* where to find the base of the SMB packet proper */
-#define smb_base(buf) ((u8 *)(((u8 *)(buf))+4))
-
-/*
- * Flags for the in-memory inode
- */
-#define SMB_F_LOCALWRITE	0x02	/* file modified locally */
-
-
-/* NT1 protocol capability bits */
-#define SMB_CAP_RAW_MODE         0x00000001
-#define SMB_CAP_MPX_MODE         0x00000002
-#define SMB_CAP_UNICODE          0x00000004
-#define SMB_CAP_LARGE_FILES      0x00000008
-#define SMB_CAP_NT_SMBS          0x00000010
-#define SMB_CAP_RPC_REMOTE_APIS  0x00000020
-#define SMB_CAP_STATUS32         0x00000040
-#define SMB_CAP_LEVEL_II_OPLOCKS 0x00000080
-#define SMB_CAP_LOCK_AND_READ    0x00000100
-#define SMB_CAP_NT_FIND          0x00000200
-#define SMB_CAP_DFS              0x00001000
-#define SMB_CAP_LARGE_READX      0x00004000
-#define SMB_CAP_LARGE_WRITEX     0x00008000
-#define SMB_CAP_UNIX             0x00800000	/* unofficial ... */
-
-
-/*
- * This is the time we allow an inode, dentry or dir cache to live. It is bad
- * for performance to have shorter ttl on an inode than on the cache. It can
- * cause refresh on each inode for a dir listing ... one-by-one
- */
-#define SMB_MAX_AGE(server) (((server)->mnt->ttl * HZ) / 1000)
-
-static inline void
-smb_age_dentry(struct smb_sb_info *server, struct dentry *dentry)
-{
-	dentry->d_time = jiffies - SMB_MAX_AGE(server);
-}
-
-struct smb_cache_head {
-	time_t		mtime;	/* unused */
-	unsigned long	time;	/* cache age */
-	unsigned long	end;	/* last valid fpos in cache */
-	int		eof;
-};
-
-#define SMB_DIRCACHE_SIZE	((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
-union smb_dir_cache {
-	struct smb_cache_head   head;
-	struct dentry           *dentry[SMB_DIRCACHE_SIZE];
-};
-
-#define SMB_FIRSTCACHE_SIZE	((int)((SMB_DIRCACHE_SIZE * \
-	sizeof(struct dentry *) - sizeof(struct smb_cache_head)) / \
-	sizeof(struct dentry *)))
-
-#define SMB_DIRCACHE_START      (SMB_DIRCACHE_SIZE - SMB_FIRSTCACHE_SIZE)
-
-struct smb_cache_control {
-	struct  smb_cache_head		head;
-	struct  page			*page;
-	union   smb_dir_cache		*cache;
-	unsigned long			fpos, ofs;
-	int				filled, valid, idx;
-};
-
-#define SMB_OPS_NUM_STATIC	5
-struct smb_ops {
-	int (*read)(struct inode *inode, loff_t offset, int count,
-		    char *data);
-	int (*write)(struct inode *inode, loff_t offset, int count, const
-		     char *data);
-	int (*readdir)(struct file *filp, void *dirent, filldir_t filldir,
-		       struct smb_cache_control *ctl);
-
-	int (*getattr)(struct smb_sb_info *server, struct dentry *dir,
-		       struct smb_fattr *fattr);
-	/* int (*setattr)(...); */      /* setattr is really icky! */
-
-	int (*truncate)(struct inode *inode, loff_t length);
-
-
-	/* --- --- --- end of "static" entries --- --- --- */
-
-	int (*convert)(unsigned char *output, int olen,
-		       const unsigned char *input, int ilen,
-		       struct nls_table *nls_from,
-		       struct nls_table *nls_to);
-};
-
-static inline int
-smb_is_open(struct inode *i)
-{
-	return (SMB_I(i)->open == server_from_inode(i)->generation);
-}
-
-extern void smb_install_null_ops(struct smb_ops *);
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_SMB_FS_H */
diff --git a/include/linux/smb_fs_i.h b/include/linux/smb_fs_i.h
deleted file mode 100644
index 8ccf4eca2c3d..000000000000
--- a/include/linux/smb_fs_i.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- *  smb_fs_i.h
- *
- *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- */
-
-#ifndef _LINUX_SMB_FS_I
-#define _LINUX_SMB_FS_I
-
-#include <linux/types.h>
-#include <linux/fs.h>
-
-/*
- * smb fs inode data (in memory only)
- */
-struct smb_inode_info {
-
-	/*
-	 * file handles are local to a connection. A file is open if
-	 * (open == generation).
-	 */
-        unsigned int open;	/* open generation */
-	__u16 fileid;		/* What id to handle a file with? */
-	__u16 attr;		/* Attribute fields, DOS value */
-
-	__u16 access;		/* Access mode */
-	__u16 flags;
-	unsigned long oldmtime;	/* last time refreshed */
-	unsigned long closed;	/* timestamp when closed */
-	unsigned openers;	/* number of fileid users */
-
-	struct inode vfs_inode;	/* must be at the end */
-};
-
-#endif
diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h
deleted file mode 100644
index bb947dd1fba9..000000000000
--- a/include/linux/smb_fs_sb.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- *  smb_fs_sb.h
- *
- *  Copyright (C) 1995 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- */
-
-#ifndef _SMB_FS_SB
-#define _SMB_FS_SB
-
-#include <linux/types.h>
-#include <linux/backing-dev.h>
-#include <linux/smb.h>
-
-/*
- * Upper limit on the total number of active smb_request structs.
- */
-#define MAX_REQUEST_HARD       256
-
-enum smb_receive_state {
-	SMB_RECV_START,		/* No data read, looking for length + sig */
-	SMB_RECV_HEADER,	/* Reading the header data */
-	SMB_RECV_HCOMPLETE,	/* Done with the header */
-	SMB_RECV_PARAM,		/* Reading parameter words */
-	SMB_RECV_DATA,		/* Reading data bytes */
-	SMB_RECV_END,		/* End of request */
-	SMB_RECV_DROP,		/* Dropping this SMB */
-	SMB_RECV_REQUEST,	/* Received a request and not a reply */
-};
-
-/* structure access macros */
-#define server_from_inode(inode) SMB_SB((inode)->i_sb)
-#define server_from_dentry(dentry) SMB_SB((dentry)->d_sb)
-#define SB_of(server) ((server)->super_block)
-
-struct smb_sb_info {
-	/* List of all smbfs superblocks */
-	struct list_head entry;
-
-        enum smb_conn_state state;
-	struct file * sock_file;
-	int conn_error;
-	enum smb_receive_state rstate;
-
-	atomic_t nr_requests;
-	struct list_head xmitq;
-	struct list_head recvq;
-	u16 mid;
-
-        struct smb_mount_data_kernel *mnt;
-
-	/* Connections are counted. Each time a new socket arrives,
-	 * generation is incremented.
-	 */
-	unsigned int generation;
-	struct pid *conn_pid;
-	struct smb_conn_opt opt;
-	wait_queue_head_t conn_wq;
-	int conn_complete;
-	struct semaphore sem;
-
-	unsigned char      header[SMB_HEADER_LEN + 20*2 + 2];
-	u32                header_len;
-	u32                smb_len;
-	u32                smb_read;
-
-        /* We use our own data_ready callback, but need the original one */
-        void *data_ready;
-
-	/* nls pointers for codepage conversions */
-	struct nls_table *remote_nls;
-	struct nls_table *local_nls;
-
-	struct smb_ops *ops;
-
-	struct super_block *super_block;
-
-	struct backing_dev_info bdi;
-};
-
-static inline int
-smb_lock_server_interruptible(struct smb_sb_info *server)
-{
-	return down_interruptible(&(server->sem));
-}
-
-static inline void
-smb_lock_server(struct smb_sb_info *server)
-{
-	down(&(server->sem));
-}
-
-static inline void
-smb_unlock_server(struct smb_sb_info *server)
-{
-	up(&(server->sem));
-}
-
-#endif
diff --git a/include/linux/smb_mount.h b/include/linux/smb_mount.h
deleted file mode 100644
index d10f00cb5703..000000000000
--- a/include/linux/smb_mount.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  smb_mount.h
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- */
-
-#ifndef _LINUX_SMB_MOUNT_H
-#define _LINUX_SMB_MOUNT_H
-
-#include <linux/types.h>
-
-#define SMB_MOUNT_VERSION	6
-
-struct smb_mount_data {
-	int version;
-	__kernel_uid_t mounted_uid; /* Who may umount() this filesystem? */
-	__kernel_uid_t uid;
-	__kernel_gid_t gid;
-	__kernel_mode_t file_mode;
-	__kernel_mode_t dir_mode;
-};
-
-
-#ifdef __KERNEL__
-
-/* "vers" in big-endian */
-#define SMB_MOUNT_ASCII 0x76657273
-
-#define SMB_MOUNT_OLDVERSION	6
-#undef SMB_MOUNT_VERSION
-#define SMB_MOUNT_VERSION	7
-
-/* flags */
-#define SMB_MOUNT_WIN95		0x0001	/* Win 95 server */
-#define SMB_MOUNT_OLDATTR	0x0002	/* Use core getattr (Win 95 speedup) */
-#define SMB_MOUNT_DIRATTR	0x0004	/* Use find_first for getattr */
-#define SMB_MOUNT_CASE		0x0008	/* Be case sensitive */
-#define SMB_MOUNT_UNICODE	0x0010	/* Server talks unicode */
-#define SMB_MOUNT_UID		0x0020  /* Use user specified uid */
-#define SMB_MOUNT_GID		0x0040  /* Use user specified gid */
-#define SMB_MOUNT_FMODE		0x0080  /* Use user specified file mode */
-#define SMB_MOUNT_DMODE		0x0100  /* Use user specified dir mode */
-
-struct smb_mount_data_kernel {
-	int version;
-
-	uid_t mounted_uid;	/* Who may umount() this filesystem? */
-	uid_t uid;
-	gid_t gid;
-	mode_t file_mode;
-	mode_t dir_mode;
-
-	u32 flags;
-
-        /* maximum age in jiffies (inode, dentry and dircache) */
-	int ttl;
-
-	struct smb_nls_codepage codepage;
-};
-
-#endif
-
-#endif
diff --git a/include/linux/smbno.h b/include/linux/smbno.h
deleted file mode 100644
index f99e02d9ffe2..000000000000
--- a/include/linux/smbno.h
+++ /dev/null
@@ -1,363 +0,0 @@
-#ifndef _SMBNO_H_
-#define _SMBNO_H_
-
-/* these define the attribute byte as seen by DOS */
-#define aRONLY	(1L<<0)
-#define aHIDDEN	(1L<<1)
-#define aSYSTEM	(1L<<2)
-#define aVOLID	(1L<<3)
-#define aDIR	(1L<<4)
-#define aARCH	(1L<<5)
-
-/* error classes */
-#define SUCCESS 0  /* The request was successful. */
-#define ERRDOS 0x01 /*  Error is from the core DOS operating system set. */
-#define ERRSRV 0x02 /* Error is generated by the server network file manager.*/
-#define ERRHRD 0x03  /* Error is an hardware error. */
-#define ERRCMD 0xFF  /* Command was not in the "SMB" format. */
-
-/* SMB X/Open error codes for the ERRdos error class */
-
-#define ERRbadfunc 1            /* Invalid function (or system call) */
-#define ERRbadfile 2            /* File not found (pathname error) */
-#define ERRbadpath 3            /* Directory not found */
-#define ERRnofids 4             /* Too many open files */
-#define ERRnoaccess 5           /* Access denied */
-#define ERRbadfid 6             /* Invalid fid */
-#define ERRbadmcb 7             /* Memory control blocks destroyed */
-#define ERRnomem 8              /* Out of memory */
-#define ERRbadmem 9             /* Invalid memory block address */
-#define ERRbadenv 10            /* Invalid environment */
-#define ERRbadformat 11         /* Invalid format */
-#define ERRbadaccess 12         /* Invalid open mode */
-#define ERRbaddata 13           /* Invalid data (only from ioctl call) */
-#define ERRres 14               /* reserved */
-#define ERRbaddrive 15          /* Invalid drive */
-#define ERRremcd 16             /* Attempt to delete current directory */
-#define ERRdiffdevice 17        /* rename/move across different filesystems */
-#define ERRnofiles 18           /* no more files found in file search */
-#define ERRbadshare 32          /* Share mode on file conflict with open mode */
-#define ERRlock 33              /* Lock request conflicts with existing lock */
-#define ERRfilexists 80         /* File in operation already exists */
-#define ERRbadpipe 230          /* Named pipe invalid */
-#define ERRpipebusy 231         /* All instances of pipe are busy */
-#define ERRpipeclosing 232      /* named pipe close in progress */
-#define ERRnotconnected 233     /* No process on other end of named pipe */
-#define ERRmoredata 234         /* More data to be returned */
-
-#define ERROR_INVALID_PARAMETER	 87
-#define ERROR_DISK_FULL		112
-#define ERROR_INVALID_NAME	123
-#define ERROR_DIR_NOT_EMPTY	145
-#define ERROR_NOT_LOCKED	158
-#define ERROR_ALREADY_EXISTS	183  /* see also 80 ? */
-#define ERROR_EAS_DIDNT_FIT	275 /* Extended attributes didn't fit */
-#define ERROR_EAS_NOT_SUPPORTED	282 /* Extended attributes not supported */
-
-/* Error codes for the ERRSRV class */
-
-#define ERRerror 1              /* Non specific error code */
-#define ERRbadpw 2              /* Bad password */
-#define ERRbadtype 3            /* reserved */
-#define ERRaccess 4          /* No permissions to do the requested operation */
-#define ERRinvnid 5             /* tid invalid */
-#define ERRinvnetname 6         /* Invalid servername */
-#define ERRinvdevice 7          /* Invalid device */
-#define ERRqfull 49             /* Print queue full */
-#define ERRqtoobig 50           /* Queued item too big */
-#define ERRinvpfid 52           /* Invalid print file in smb_fid */
-#define ERRsmbcmd 64            /* Unrecognised command */
-#define ERRsrverror 65          /* smb server internal error */
-#define ERRfilespecs 67         /* fid and pathname invalid combination */
-#define ERRbadlink 68           /* reserved */
-#define ERRbadpermits 69        /* Access specified for a file is not valid */
-#define ERRbadpid 70            /* reserved */
-#define ERRsetattrmode 71       /* attribute mode invalid */
-#define ERRpaused 81            /* Message server paused */
-#define ERRmsgoff 82            /* Not receiving messages */
-#define ERRnoroom 83            /* No room for message */
-#define ERRrmuns 87             /* too many remote usernames */
-#define ERRtimeout 88           /* operation timed out */
-#define ERRnoresource  89   /* No resources currently available for request. */
-#define ERRtoomanyuids 90       /* too many userids */
-#define ERRbaduid 91            /* bad userid */
-#define ERRuseMPX 250    /* temporarily unable to use raw mode, use MPX mode */
-#define ERRuseSTD 251    /* temporarily unable to use raw mode, use std.mode */
-#define ERRcontMPX 252          /* resume MPX mode */
-#define ERRbadPW                /* reserved */
-#define ERRnosupport 0xFFFF
-
-/* Error codes for the ERRHRD class */
-
-#define ERRnowrite 19           /* read only media */
-#define ERRbadunit 20           /* Unknown device */
-#define ERRnotready 21          /* Drive not ready */
-#define ERRbadcmd 22            /* Unknown command */
-#define ERRdata 23              /* Data (CRC) error */
-#define ERRbadreq 24            /* Bad request structure length */
-#define ERRseek 25
-#define ERRbadmedia 26
-#define ERRbadsector 27
-#define ERRnopaper 28
-#define ERRwrite 29             /* write fault */
-#define ERRread 30              /* read fault */
-#define ERRgeneral 31           /* General hardware failure */
-#define ERRwrongdisk 34
-#define ERRFCBunavail 35
-#define ERRsharebufexc 36       /* share buffer exceeded */
-#define ERRdiskfull 39
-
-/*
- * Access modes when opening a file
- */
-#define SMB_ACCMASK	0x0003
-#define SMB_O_RDONLY	0x0000
-#define SMB_O_WRONLY	0x0001
-#define SMB_O_RDWR	0x0002
-
-/* offsets into message for common items */
-#define smb_com 8
-#define smb_rcls 9
-#define smb_reh 10
-#define smb_err 11
-#define smb_flg 13
-#define smb_flg2 14
-#define smb_reb 13
-#define smb_tid 28
-#define smb_pid 30
-#define smb_uid 32
-#define smb_mid 34
-#define smb_wct 36
-#define smb_vwv 37
-#define smb_vwv0 37
-#define smb_vwv1 39
-#define smb_vwv2 41
-#define smb_vwv3 43
-#define smb_vwv4 45
-#define smb_vwv5 47
-#define smb_vwv6 49
-#define smb_vwv7 51
-#define smb_vwv8 53
-#define smb_vwv9 55
-#define smb_vwv10 57
-#define smb_vwv11 59
-#define smb_vwv12 61
-#define smb_vwv13 63
-#define smb_vwv14 65
-
-/* these are the trans2 sub fields for primary requests */
-#define smb_tpscnt smb_vwv0
-#define smb_tdscnt smb_vwv1
-#define smb_mprcnt smb_vwv2
-#define smb_mdrcnt smb_vwv3
-#define smb_msrcnt smb_vwv4
-#define smb_flags smb_vwv5
-#define smb_timeout smb_vwv6
-#define smb_pscnt smb_vwv9
-#define smb_psoff smb_vwv10
-#define smb_dscnt smb_vwv11
-#define smb_dsoff smb_vwv12
-#define smb_suwcnt smb_vwv13
-#define smb_setup smb_vwv14
-#define smb_setup0 smb_setup
-#define smb_setup1 (smb_setup+2)
-#define smb_setup2 (smb_setup+4)
-
-/* these are for the secondary requests */
-#define smb_spscnt smb_vwv2
-#define smb_spsoff smb_vwv3
-#define smb_spsdisp smb_vwv4
-#define smb_sdscnt smb_vwv5
-#define smb_sdsoff smb_vwv6
-#define smb_sdsdisp smb_vwv7
-#define smb_sfid smb_vwv8
-
-/* and these for responses */
-#define smb_tprcnt smb_vwv0
-#define smb_tdrcnt smb_vwv1
-#define smb_prcnt smb_vwv3
-#define smb_proff smb_vwv4
-#define smb_prdisp smb_vwv5
-#define smb_drcnt smb_vwv6
-#define smb_droff smb_vwv7
-#define smb_drdisp smb_vwv8
-
-/* the complete */
-#define SMBmkdir      0x00   /* create directory */
-#define SMBrmdir      0x01   /* delete directory */
-#define SMBopen       0x02   /* open file */
-#define SMBcreate     0x03   /* create file */
-#define SMBclose      0x04   /* close file */
-#define SMBflush      0x05   /* flush file */
-#define SMBunlink     0x06   /* delete file */
-#define SMBmv         0x07   /* rename file */
-#define SMBgetatr     0x08   /* get file attributes */
-#define SMBsetatr     0x09   /* set file attributes */
-#define SMBread       0x0A   /* read from file */
-#define SMBwrite      0x0B   /* write to file */
-#define SMBlock       0x0C   /* lock byte range */
-#define SMBunlock     0x0D   /* unlock byte range */
-#define SMBctemp      0x0E   /* create temporary file */
-#define SMBmknew      0x0F   /* make new file */
-#define SMBchkpth     0x10   /* check directory path */
-#define SMBexit       0x11   /* process exit */
-#define SMBlseek      0x12   /* seek */
-#define SMBtcon       0x70   /* tree connect */
-#define SMBtconX      0x75   /* tree connect and X*/
-#define SMBtdis       0x71   /* tree disconnect */
-#define SMBnegprot    0x72   /* negotiate protocol */
-#define SMBdskattr    0x80   /* get disk attributes */
-#define SMBsearch     0x81   /* search directory */
-#define SMBsplopen    0xC0   /* open print spool file */
-#define SMBsplwr      0xC1   /* write to print spool file */
-#define SMBsplclose   0xC2   /* close print spool file */
-#define SMBsplretq    0xC3   /* return print queue */
-#define SMBsends      0xD0   /* send single block message */
-#define SMBsendb      0xD1   /* send broadcast message */
-#define SMBfwdname    0xD2   /* forward user name */
-#define SMBcancelf    0xD3   /* cancel forward */
-#define SMBgetmac     0xD4   /* get machine name */
-#define SMBsendstrt   0xD5   /* send start of multi-block message */
-#define SMBsendend    0xD6   /* send end of multi-block message */
-#define SMBsendtxt    0xD7   /* send text of multi-block message */
-
-/* Core+ protocol */
-#define SMBlockread	  0x13   /* Lock a range and read */
-#define SMBwriteunlock 0x14 /* Unlock a range then write */
-#define SMBreadbraw   0x1a  /* read a block of data with no smb header */
-#define SMBwritebraw  0x1d  /* write a block of data with no smb header */
-#define SMBwritec     0x20  /* secondary write request */
-#define SMBwriteclose 0x2c  /* write a file then close it */
-
-/* dos extended protocol */
-#define SMBreadBraw      0x1A   /* read block raw */
-#define SMBreadBmpx      0x1B   /* read block multiplexed */
-#define SMBreadBs        0x1C   /* read block (secondary response) */
-#define SMBwriteBraw     0x1D   /* write block raw */
-#define SMBwriteBmpx     0x1E   /* write block multiplexed */
-#define SMBwriteBs       0x1F   /* write block (secondary request) */
-#define SMBwriteC        0x20   /* write complete response */
-#define SMBsetattrE      0x22   /* set file attributes expanded */
-#define SMBgetattrE      0x23   /* get file attributes expanded */
-#define SMBlockingX      0x24   /* lock/unlock byte ranges and X */
-#define SMBtrans         0x25   /* transaction - name, bytes in/out */
-#define SMBtranss        0x26   /* transaction (secondary request/response) */
-#define SMBioctl         0x27   /* IOCTL */
-#define SMBioctls        0x28   /* IOCTL  (secondary request/response) */
-#define SMBcopy          0x29   /* copy */
-#define SMBmove          0x2A   /* move */
-#define SMBecho          0x2B   /* echo */
-#define SMBopenX         0x2D   /* open and X */
-#define SMBreadX         0x2E   /* read and X */
-#define SMBwriteX        0x2F   /* write and X */
-#define SMBsesssetupX    0x73   /* Session Set Up & X (including User Logon) */
-#define SMBtconX         0x75   /* tree connect and X */
-#define SMBffirst        0x82   /* find first */
-#define SMBfunique       0x83   /* find unique */
-#define SMBfclose        0x84   /* find close */
-#define SMBinvalid       0xFE   /* invalid command */
-
-
-/* Extended 2.0 protocol */
-#define SMBtrans2        0x32   /* TRANS2 protocol set */
-#define SMBtranss2       0x33   /* TRANS2 protocol set, secondary command */
-#define SMBfindclose     0x34   /* Terminate a TRANSACT2_FINDFIRST */
-#define SMBfindnclose    0x35   /* Terminate a TRANSACT2_FINDNOTIFYFIRST */
-#define SMBulogoffX      0x74   /* user logoff */
-
-/* these are the TRANS2 sub commands */
-#define TRANSACT2_OPEN          0
-#define TRANSACT2_FINDFIRST     1
-#define TRANSACT2_FINDNEXT      2
-#define TRANSACT2_QFSINFO       3
-#define TRANSACT2_SETFSINFO     4
-#define TRANSACT2_QPATHINFO     5
-#define TRANSACT2_SETPATHINFO   6
-#define TRANSACT2_QFILEINFO     7
-#define TRANSACT2_SETFILEINFO   8
-#define TRANSACT2_FSCTL         9
-#define TRANSACT2_IOCTL           10
-#define TRANSACT2_FINDNOTIFYFIRST 11
-#define TRANSACT2_FINDNOTIFYNEXT  12
-#define TRANSACT2_MKDIR           13
-
-/* Information Levels -  Shared? */
-#define SMB_INFO_STANDARD		1
-#define SMB_INFO_QUERY_EA_SIZE		2
-#define SMB_INFO_QUERY_EAS_FROM_LIST	3
-#define SMB_INFO_QUERY_ALL_EAS		4
-#define SMB_INFO_IS_NAME_VALID		6
-
-/* Information Levels -  TRANSACT2_FINDFIRST */
-#define SMB_FIND_FILE_DIRECTORY_INFO		0x101
-#define SMB_FIND_FILE_FULL_DIRECTORY_INFO	0x102
-#define SMB_FIND_FILE_NAMES_INFO		0x103
-#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO	0x104
-
-/* Information Levels -  TRANSACT2_QPATHINFO */
-#define SMB_QUERY_FILE_BASIC_INFO	0x101
-#define SMB_QUERY_FILE_STANDARD_INFO	0x102
-#define SMB_QUERY_FILE_EA_INFO		0x103
-#define SMB_QUERY_FILE_NAME_INFO	0x104
-#define SMB_QUERY_FILE_ALL_INFO		0x107
-#define SMB_QUERY_FILE_ALT_NAME_INFO	0x108
-#define SMB_QUERY_FILE_STREAM_INFO	0x109
-#define SMB_QUERY_FILE_COMPRESSION_INFO	0x10b
-
-/* Information Levels - TRANSACT2_SETFILEINFO */
-#define SMB_SET_FILE_BASIC_INFO		0x101
-#define SMB_SET_FILE_DISPOSITION_INFO	0x102
-#define SMB_SET_FILE_ALLOCATION_INFO	0x103
-#define SMB_SET_FILE_END_OF_FILE_INFO	0x104
-
-/* smb_flg field flags */
-#define SMB_FLAGS_SUPPORT_LOCKREAD	0x01
-#define SMB_FLAGS_CLIENT_BUF_AVAIL	0x02
-#define SMB_FLAGS_RESERVED		0x04
-#define SMB_FLAGS_CASELESS_PATHNAMES	0x08
-#define SMB_FLAGS_CANONICAL_PATHNAMES	0x10
-#define SMB_FLAGS_REQUEST_OPLOCK	0x20
-#define SMB_FLAGS_REQUEST_BATCH_OPLOCK	0x40
-#define SMB_FLAGS_REPLY			0x80
-
-/* smb_flg2 field flags (samba-2.2.0/source/include/smb.h) */
-#define SMB_FLAGS2_LONG_PATH_COMPONENTS		0x0001
-#define SMB_FLAGS2_EXTENDED_ATTRIBUTES		0x0002
-#define SMB_FLAGS2_DFS_PATHNAMES		0x1000
-#define SMB_FLAGS2_READ_PERMIT_NO_EXECUTE	0x2000
-#define SMB_FLAGS2_32_BIT_ERROR_CODES		0x4000 
-#define SMB_FLAGS2_UNICODE_STRINGS		0x8000
-
-
-/*
- * UNIX stuff  (from samba trans2.h)
- */
-#define MIN_UNIX_INFO_LEVEL		0x200
-#define MAX_UNIX_INFO_LEVEL		0x2FF
-#define SMB_FIND_FILE_UNIX		0x202
-#define SMB_QUERY_FILE_UNIX_BASIC	0x200
-#define SMB_QUERY_FILE_UNIX_LINK	0x201
-#define SMB_QUERY_FILE_UNIX_HLINK	0x202
-#define SMB_SET_FILE_UNIX_BASIC		0x200
-#define SMB_SET_FILE_UNIX_LINK		0x201
-#define SMB_SET_FILE_UNIX_HLINK		0x203
-#define SMB_QUERY_CIFS_UNIX_INFO	0x200
-
-/* values which means "don't change it" */
-#define SMB_MODE_NO_CHANGE		0xFFFFFFFF
-#define SMB_UID_NO_CHANGE		0xFFFFFFFF
-#define SMB_GID_NO_CHANGE		0xFFFFFFFF
-#define SMB_TIME_NO_CHANGE		0xFFFFFFFFFFFFFFFFULL
-#define SMB_SIZE_NO_CHANGE		0xFFFFFFFFFFFFFFFFULL
-
-/* UNIX filetype mappings. */
-#define UNIX_TYPE_FILE		0
-#define UNIX_TYPE_DIR		1
-#define UNIX_TYPE_SYMLINK	2
-#define UNIX_TYPE_CHARDEV	3
-#define UNIX_TYPE_BLKDEV	4
-#define UNIX_TYPE_FIFO		5
-#define UNIX_TYPE_SOCKET	6
-#define UNIX_TYPE_UNKNOWN	0xFFFFFFFF
-
-#endif /* _SMBNO_H_ */
-- 
cgit v1.2.3


From 134669854e3a680d8aad9a4047891c653715f4c0 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 6 Oct 2010 09:58:44 +0100
Subject: GFS2: Fix type mapping for demote_rq interface

Mostly the glock operations follow the type of the glock. The
one exception is the transaction glock, so we need to check for
that directly.

Reported-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c | 1 -
 fs/gfs2/sys.c   | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 621d80e8fb2a..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -452,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
 	[LM_TYPE_META] = &gfs2_meta_glops,
 	[LM_TYPE_INODE] = &gfs2_inode_glops,
 	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
 	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
 	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
 	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 64082a5feae1..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	glops = gfs2_glops_list[gltype];
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+		glops = &gfs2_trans_glops;
+	else
+		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
 	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
-- 
cgit v1.2.3


From 7ffec372458d163492e56e663a1b3a2d7be0a0a2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 29 Sep 2010 19:51:11 -0400
Subject: cifs: add refcounted and timestamped container for holding tcons

Eventually, we'll need to track the use of tcons on a per-sb basis, so that
we know when it's ok to tear them down. Begin this conversion by adding a
new "tcon_link" struct and accessors that get it. For now, the core data
structures are untouched -- cifs_sb still just points to a single tcon and
the pointers are just cast to deal with the accessor functions. A later
patch will flesh this out.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c |  21 ++++---
 fs/cifs/cifsacl.c      |  42 ++++++++++---
 fs/cifs/cifsglob.h     |  51 ++++++++++++----
 fs/cifs/dir.c          |  65 ++++++++++++++------
 fs/cifs/file.c         |   9 ++-
 fs/cifs/inode.c        | 157 ++++++++++++++++++++++++++++++++++++++-----------
 fs/cifs/link.c         |  87 ++++++++++++++++++---------
 fs/cifs/readdir.c      |  32 +++++-----
 fs/cifs/xattr.c        |  60 ++++++++++++-------
 9 files changed, 376 insertions(+), 148 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f1e13ea45a17..f4aab6f01174 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -306,6 +306,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	int xid, i;
 	int rc = 0;
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct tcon_link *tlink;
 
 	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(dentry));
@@ -315,14 +316,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	dput(nd->path.dentry);
 	nd->path.dentry = dget(dentry);
 
-	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	ses = cifs_sb_tcon(cifs_sb)->ses;
-
-	if (!ses) {
-		rc = -EINVAL;
-		goto out_err;
-	}
-
 	/*
 	 * The MSDFS spec states that paths in DFS referral requests and
 	 * responses must be prefixed by a single '\' character instead of
@@ -335,10 +328,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 	}
 
-	rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls,
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto out_err;
+	}
+	ses = tlink_tcon(tlink)->ses;
+
+	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
 		&num_referrals, &referrals,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+	cifs_put_tlink(tlink);
+
 	for (i = 0; i < num_referrals; i++) {
 		int len;
 		dump_referral(referrals+i);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 32f244909a0d..2647ea410c4c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 {
 	struct cifs_ntsd *pntsd = NULL;
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
 	xid = GetXid();
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
 	FreeXid(xid);
 
+	cifs_put_tlink(tlink);
 
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 	return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb_tcon(cifs_sb), path, FILE_OPEN, READ_CONTROL, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
-	CIFSSMBClose(xid, cifs_sb_tcon(cifs_sb), fid);
+	CIFSSMBClose(xid, tcon, fid);
  out:
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return pntsd;
 }
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 		struct cifs_ntsd *pnntsd, u32 acllen)
 {
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
 
 	xid = GetXid();
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 	return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
 
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb_tcon(cifs_sb), path, FILE_OPEN, WRITE_DAC, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb_tcon(cifs_sb), fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
-	CIFSSMBClose(xid, cifs_sb_tcon(cifs_sb), fid);
- out:
+	CIFSSMBClose(xid, tcon, fid);
+out:
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c265ebdcd177..cdfd2db4e70d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -310,6 +310,44 @@ struct cifsTconInfo {
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
+/*
+ * This is a refcounted and timestamped container for a tcon pointer. The
+ * container holds a tcon reference. It is considered safe to free one of
+ * these when the tl_count goes to 0. The tl_time is the time of the last
+ * "get" on the container.
+ */
+struct tcon_link {
+	spinlock_t		tl_lock;
+	u32			tl_count;
+	u64			tl_time;
+	struct cifsTconInfo	*tl_tcon;
+};
+
+static inline struct tcon_link *
+cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
+{
+	return (struct tcon_link *)cifs_sb->ptcon;
+}
+
+static inline struct cifsTconInfo *
+tlink_tcon(struct tcon_link *tlink)
+{
+	return (struct cifsTconInfo *)tlink;
+}
+
+static inline void
+cifs_put_tlink(struct tcon_link *tlink)
+{
+	return;
+}
+
+/* This function is always expected to succeed */
+static inline struct cifsTconInfo *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return cifs_sb->ptcon;
+}
+
 /*
  * This info hangs off the cifsFileInfo structure, pointed to by llist.
  * This is used to track byte stream locks on the file
@@ -413,19 +451,6 @@ CIFS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-static inline struct cifsTconInfo *
-cifs_sb_tcon(struct cifs_sb_info *cifs_sb)
-{
-	return cifs_sb->ptcon;
-}
-
-/* This function is always expected to succeed */
-static inline struct cifsTconInfo *
-cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
-{
-	return cifs_sb->ptcon;
-}
-
 static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 23ec28a4c11f..bb3ea06ca6f4 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -137,7 +137,6 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 {
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
 
 	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
 	if (pCifsFile == NULL)
@@ -191,7 +190,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifs_fattr fattr;
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 
 	cFYI(1, "posix open %s", full_path);
 
@@ -226,10 +226,20 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		posix_flags |= SMB_O_DIRECT;
 
 	mode &= ~current_umask();
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto posix_open_ret;
+	}
+
+	tcon = tlink_tcon(tlink);
 	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
 			     poplock, full_path, cifs_sb->local_nls,
 			     cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
+
 	if (rc)
 		goto posix_open_ret;
 
@@ -290,6 +300,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *tcon;
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
@@ -299,13 +310,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb_tcon(cifs_sb);
-
-	full_path = build_path_from_dentry(direntry);
-	if (full_path == NULL) {
-		rc = -ENOMEM;
-		goto cifs_create_out;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		FreeXid(xid);
+		return PTR_ERR(tlink);
 	}
+	tcon = tlink_tcon(tlink);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -315,6 +325,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	else
 		oflags = FMODE_READ | SMB_O_CREAT;
 
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto cifs_create_out;
+	}
+
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -481,6 +497,7 @@ cifs_create_set_dentry:
 cifs_create_out:
 	kfree(buf);
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return rc;
 }
@@ -491,6 +508,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	int rc = -EPERM;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -503,10 +521,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	if (!old_valid_dev(device_number))
 		return -EINVAL;
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -606,6 +628,7 @@ mknod_out:
 	kfree(full_path);
 	kfree(buf);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -619,6 +642,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *cfile;
 	struct inode *newInode = NULL;
@@ -633,7 +657,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/* check whether path exists */
 
 	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		FreeXid(xid);
+		return (struct dentry *)tlink;
+	}
+	pTcon = tlink_tcon(tlink);
 
 	/*
 	 * Don't allow the separator character in a path component.
@@ -644,8 +673,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		for (i = 0; i < direntry->d_name.len; i++)
 			if (direntry->d_name.name[i] == '\\') {
 				cFYI(1, "Invalid file name");
-				FreeXid(xid);
-				return ERR_PTR(-EINVAL);
+				rc = -EINVAL;
+				goto lookup_out;
 			}
 	}
 
@@ -655,7 +684,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 */
 	if (nd && (nd->flags & LOOKUP_EXCL)) {
 		d_instantiate(direntry, NULL);
-		return NULL;
+		rc = 0;
+		goto lookup_out;
 	}
 
 	/* can not grab the rename sem here since it would
@@ -663,8 +693,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	in which we already have the sb rename sem */
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
-		FreeXid(xid);
-		return ERR_PTR(-ENOMEM);
+		rc = -ENOMEM;
+		goto lookup_out;
 	}
 
 	if (direntry->d_inode != NULL) {
@@ -760,6 +790,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 lookup_out:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return ERR_PTR(rc);
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de046e183d12..1e375abc5eb3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -224,6 +224,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
@@ -235,7 +236,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		FreeXid(xid);
+		return PTR_ERR(tlink);
+	}
+	tcon = tlink_tcon(tlink);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 
@@ -402,6 +408,7 @@ out:
 	kfree(buf);
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index da716d96dae6..aa229692aef1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -313,16 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb_tcon(cifs_sb);
-
 	cFYI(1, "Getting info on %s", full_path);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -361,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -380,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
@@ -388,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 			/* Read header */
-		rc = CIFSSMBRead(xid, pTcon, netfid,
+		rc = CIFSSMBRead(xid, tcon, netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
@@ -430,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, tcon, netfid);
 	}
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -449,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	ssize_t rc;
 	char ea_value[4];
 	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb_tcon(cifs_sb), path, "SETFILEBITS",
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
@@ -564,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
 	int rc = 0, tmprc;
 	struct cifsTconInfo *pTcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	char *buf = NULL;
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
 			cFYI(1, "No need to revalidate cached inode sizes");
-			return rc;
+			goto cgii_exit;
 		}
 	}
 
 	/* if file info not passed in then get it from server */
 	if (pfindData == NULL) {
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 		pfindData = (FILE_ALL_INFO *)buf;
 
 		/* could do find first instead but this returns more info */
@@ -688,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
 
 cgii_exit:
 	kfree(buf);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -895,6 +923,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = NULL;
 	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO	info_buf;
 
@@ -942,7 +971,13 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 		goto set_via_filehandle;
 	}
 
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	pTcon = tlink_tcon(tlink);
 
 	/*
 	 * NT4 apparently returns success on this call, but it doesn't
@@ -987,6 +1022,8 @@ set_via_filehandle:
 	else
 		cifsFileInfo_put(open_file);
 out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1004,10 +1041,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1076,6 +1119,7 @@ out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
 	kfree(info_buf);
+	cifs_put_tlink(tlink);
 	return rc;
 
 	/*
@@ -1115,12 +1159,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	/* Unlink can be called from rename so we can not take the
@@ -1128,8 +1178,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto unlink_out;
 	}
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1195,10 +1244,11 @@ out_reval:
 	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
-
+unlink_out:
 	kfree(full_path);
 	kfree(attrs);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1207,6 +1257,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -1214,16 +1265,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 
 	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto mkdir_out;
 	}
 
 	if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1381,6 +1434,7 @@ mkdir_get_info:
 mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1389,6 +1443,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	int rc = 0;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
@@ -1397,18 +1452,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
-
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto rmdir_exit;
+	}
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
 	}
+	pTcon = tlink_tcon(tlink);
 
 	rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		drop_nlink(inode);
@@ -1429,6 +1489,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
 		current_fs_time(inode->i_sb);
 
+rmdir_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1439,10 +1500,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 		struct dentry *to_dentry, const char *toPath)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
-	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
 	__u16 srcfid;
 	int oplock, rc;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	/* try path-based rename first */
 	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
 			   cifs_sb->mnt_cifs_flags &
@@ -1454,11 +1521,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 	 * rename by filehandle to various Windows servers.
 	 */
 	if (rc == 0 || rc != -ETXTBSY)
-		return rc;
+		goto do_rename_exit;
 
 	/* open-file renames don't work across directories */
 	if (to_dentry->d_parent != from_dentry->d_parent)
-		return rc;
+		goto do_rename_exit;
 
 	/* open the file to be renamed -- we need DELETE perms */
 	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1474,7 +1541,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 
 		CIFSSMBClose(xid, pTcon, srcfid);
 	}
-
+do_rename_exit:
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1484,13 +1552,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	char *fromName = NULL;
 	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *tcon;
 	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
 	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid, rc, tmprc;
 
 	cifs_sb = CIFS_SB(source_dir->i_sb);
-	tcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	xid = GetXid();
 
@@ -1566,6 +1638,7 @@ cifs_rename_exit:
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1728,6 +1801,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = NULL;
 	struct cifsTconInfo *pTcon = NULL;
 
 	/*
@@ -1758,8 +1832,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		rc = -EINVAL;
 
 	if (rc != 0) {
-		if (pTcon == NULL)
-			pTcon = cifs_sb_tcon(cifs_sb);
+		if (pTcon == NULL) {
+			tlink = cifs_sb_tlink(cifs_sb);
+			if (IS_ERR(tlink))
+				return PTR_ERR(tlink);
+			pTcon = tlink_tcon(tlink);
+		}
 
 		/* Set file size by pathname rather than by handle
 		   either because no valid, writeable file handle for
@@ -1790,6 +1868,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 				CIFSSMBClose(xid, pTcon, netfid);
 			}
 		}
+		if (tlink)
+			cifs_put_tlink(tlink);
 	}
 
 	if (rc == 0) {
@@ -1810,6 +1890,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct inode *inode = direntry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
@@ -1905,11 +1986,17 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
 		cifsFileInfo_put(open_file);
 	} else {
-		pTcon = cifs_sb_tcon(cifs_sb);
+		tlink = cifs_sb_tlink(cifs_sb);
+		if (IS_ERR(tlink)) {
+			rc = PTR_ERR(tlink);
+			goto out;
+		}
+		pTcon = tlink_tcon(tlink);
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
 				    cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		cifs_put_tlink(tlink);
 	}
 
 	if (rc)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 66db2d61fa43..b38fe6704ad2 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -249,7 +249,8 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 	int rc;
 	int oplock = 0;
 	__u16 netfid = 0;
-	struct cifsTconInfo *pTcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
 	u8 *buf;
 	char *pbuf;
 	unsigned int bytes_read = 0;
@@ -261,23 +262,30 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 		/* it's not a symlink */
 		return 0;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc != 0)
-		return rc;
+		goto out;
 
 	if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
 		CIFSSMBClose(xid, pTcon, netfid);
 		/* it's not a symlink */
-		return 0;
+		goto out;
 	}
 
 	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
 	pbuf = buf;
 
 	rc = CIFSSMBRead(xid, pTcon, netfid,
@@ -287,23 +295,28 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 	CIFSSMBClose(xid, pTcon, netfid);
 	if (rc != 0) {
 		kfree(buf);
-		return rc;
+		goto out;
 	}
 
 	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
 	kfree(buf);
-	if (rc == -EINVAL)
+	if (rc == -EINVAL) {
 		/* it's not a symlink */
-		return 0;
+		rc = 0;
+		goto out;
+	}
+
 	if (rc != 0)
-		return rc;
+		goto out;
 
 	/* it is a symlink */
 	fattr->cf_eof = link_len;
 	fattr->cf_mode &= ~S_IFMT;
 	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
 	fattr->cf_dtype = DT_LNK;
-	return 0;
+out:
+	cifs_put_tlink(tlink);
+	return rc;
 }
 
 int
@@ -314,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 	int xid;
 	char *fromName = NULL;
 	char *toName = NULL;
-	struct cifs_sb_info *cifs_sb_target;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct cifsInodeInfo *cifsInode;
 
-	xid = GetXid();
-
-	cifs_sb_target = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb_target);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
 
-/* No need to check for cross device links since server will do that
-   BB note DFS case in future though (when we may have to check) */
+	xid = GetXid();
 
 	fromName = build_path_from_dentry(old_file);
 	toName = build_path_from_dentry(direntry);
@@ -336,13 +349,13 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 /*	if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
 	if (pTcon->unix_ext)
 		rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
-					    cifs_sb_target->local_nls,
-					    cifs_sb_target->mnt_cifs_flags &
+					    cifs_sb->local_nls,
+					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 	else {
 		rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
-					cifs_sb_target->local_nls,
-					cifs_sb_target->mnt_cifs_flags &
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if ((rc == -EIO) || (rc == -EINVAL))
 			rc = -EOPNOTSUPP;
@@ -378,6 +391,7 @@ cifs_hl_exit:
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -390,10 +404,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	char *full_path = NULL;
 	char *target_path = NULL;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb_tcon(cifs_sb);
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *tcon;
 
 	xid = GetXid();
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	tcon = tlink_tcon(tlink);
+
 	/*
 	 * For now, we just handle symlinks with unix extensions enabled.
 	 * Eventually we should handle NTFS reparse points, and MacOS
@@ -442,6 +465,8 @@ out:
 	}
 
 	FreeXid(xid);
+	if (tlink)
+		cifs_put_tlink(tlink);
 	nd_set_link(nd, target_path);
 	return NULL;
 }
@@ -451,22 +476,25 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 {
 	int rc = -EOPNOTSUPP;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto symlink_exit;
+	}
+	pTcon = tlink_tcon(tlink);
 
 	full_path = build_path_from_dentry(direntry);
-
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto symlink_exit;
 	}
 
 	cFYI(1, "Full path: %s", full_path);
@@ -504,8 +532,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 			d_instantiate(direntry, newinode);
 		}
 	}
-
+symlink_exit:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return rc;
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 887a7e230376..170047cf4522 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -223,33 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 static int initiate_cifs_search(const int xid, struct file *file)
 {
 	int rc = 0;
-	char *full_path;
+	char *full_path = NULL;
 	struct cifsFileInfo *cifsFile;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EINVAL;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
 
 	if (file->private_data == NULL)
 		file->private_data =
 			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (file->private_data == NULL) {
+		rc = -ENOMEM;
+		goto error_exit;
+	}
 
-	if (file->private_data == NULL)
-		return -ENOMEM;
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = true;
 	cifsFile->srch_inf.endOfSearch = false;
-	cifsFile->tcon = cifs_sb_tcon(cifs_sb);
-	pTcon = cifsFile->tcon;
-	if (pTcon == NULL)
-		return -EINVAL;
+	cifsFile->tcon = pTcon;
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
-
-	if (full_path == NULL)
-		return -ENOMEM;
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto error_exit;
+	}
 
 	cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
 
@@ -282,7 +284,9 @@ ffirst_retry:
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
 	}
+error_exit:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 41f95bf67977..a264b744bb41 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
-	char *full_path;
+	char *full_path = NULL;
 
 	if (direntry == NULL)
 		return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	sb = direntry->d_inode->i_sb;
 	if (sb == NULL)
 		return -EIO;
-	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto remove_ea_exit;
 	}
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 remove_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	sb = direntry->d_inode->i_sb;
 	if (sb == NULL)
 		return -EIO;
-	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto set_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
 		cFYI(1, "size of EA value too large");
-		kfree(full_path);
-		FreeXid(xid);
-		return -EOPNOTSUPP;
+		rc = -EOPNOTSUPP;
+		goto set_ea_exit;
 	}
 
 	if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 set_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	if (sb == NULL)
 		return -EIO;
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto get_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 get_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 		return -EIO;
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb_tcon(cifs_sb);
-
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 		return -EOPNOTSUPP;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto list_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+list_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
-- 
cgit v1.2.3


From 13cfb7334eb6fd0fc06da5589aea1e947791f1d6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 29 Sep 2010 19:51:11 -0400
Subject: cifs: have cifsFileInfo hold a reference to a tlink rather than tcon
 pointer

cifsFileInfo needs a pointer to a tcon, but it doesn't currently hold a
reference to it. Change it to keep a pointer to a tcon_link instead and
hold a reference to it.

That will keep the tcon from being freed until the file is closed.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    |  3 ++-
 fs/cifs/cifsglob.h  |  9 ++++++++-
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/dir.c       | 10 +++++-----
 fs/cifs/file.c      | 31 ++++++++++++++++---------------
 fs/cifs/inode.c     | 12 ++++++------
 fs/cifs/ioctl.c     |  2 +-
 fs/cifs/link.c      |  1 -
 fs/cifs/readdir.c   |  4 ++--
 9 files changed, 41 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 898d2a5cfad2..b2fd075dc2e6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -595,7 +595,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	    ((arg == F_WRLCK) &&
 		(CIFS_I(inode)->clientCanCacheAll)))
 		return generic_setlease(file, arg, lease);
-	else if (cfile->tcon->local_lease && !CIFS_I(inode)->clientCanCacheRead)
+	else if (tlink_tcon(cfile->tlink)->local_lease &&
+		 !CIFS_I(inode)->clientCanCacheRead)
 		/* If the server claims to support oplock on this
 		   file, then we still need to check oplock even
 		   if the local_lease mount option is set, but there
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cdfd2db4e70d..d5324853203b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -341,6 +341,12 @@ cifs_put_tlink(struct tcon_link *tlink)
 	return;
 }
 
+static inline struct tcon_link *
+cifs_get_tlink(struct tcon_link *tlink)
+{
+	return tlink;
+}
+
 /* This function is always expected to succeed */
 static inline struct cifsTconInfo *
 cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
@@ -389,7 +395,7 @@ struct cifsFileInfo {
 	struct file *pfile; /* needed for writepage */
 	struct inode *pInode; /* needed for oplock break */
 	struct vfsmount *mnt;
-	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
 	bool closePend:1;	/* file is marked to close */
@@ -411,6 +417,7 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
 	if (atomic_dec_and_test(&cifs_file->count)) {
+		cifs_put_tlink(cifs_file->tlink);
 		iput(cifs_file->pInode);
 		kfree(cifs_file);
 	}
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 610b2263c9f3..7294723d0625 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,7 +107,7 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 
 extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
-				struct vfsmount *mnt, struct cifsTconInfo *tcon,
+				struct vfsmount *mnt, struct tcon_link *tlink,
 				unsigned int oflags, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index bb3ea06ca6f4..5adf47f28fed 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -132,7 +132,7 @@ cifs_bp_rename_retry:
 
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
-		  struct vfsmount *mnt, struct cifsTconInfo *tcon,
+		  struct vfsmount *mnt, struct tcon_link *tlink,
 		  unsigned int oflags, __u32 oplock)
 {
 	struct cifsFileInfo *pCifsFile;
@@ -149,7 +149,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
-	pCifsFile->tcon = tcon;
+	pCifsFile->tlink = cifs_get_tlink(tlink);
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
@@ -157,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &tcon->openFileList);
+	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
 	pCifsInode = CIFS_I(newinode);
 	if (pCifsInode) {
 		/* if readable file instance put first in list*/
@@ -483,7 +483,7 @@ cifs_create_set_dentry:
 		}
 
 		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-					       nd->path.mnt, tcon, oflags,
+						nd->path.mnt, tlink, oflags,
 						oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
@@ -758,7 +758,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			}
 
 			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  nd->path.mnt, pTcon,
+						  nd->path.mnt, tlink,
 						  nd->intent.open.flags,
 						  oplock);
 			if (cfile == NULL) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e375abc5eb3..24332d437150 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -283,7 +283,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
 							file->f_path.mnt,
-							tcon, oflags, oplock);
+							tlink, oflags, oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
@@ -376,7 +376,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 
 	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
-					tcon, file->f_flags, oplock);
+					tlink, file->f_flags, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
@@ -468,7 +468,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = pCifsFile->tcon;
+	tcon = tlink_tcon(pCifsFile->tlink);
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
@@ -582,7 +582,7 @@ int cifs_close(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = pSMBFile->tcon;
+	pTcon = tlink_tcon(pSMBFile->tlink);
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
 		write_lock(&GlobalSMBSeslock);
@@ -660,7 +660,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	if (pCFileStruct) {
-		struct cifsTconInfo *pTcon = pCFileStruct->tcon;
+		struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
 
 		cFYI(1, "Freeing private data in close dir");
 		write_lock(&GlobalSMBSeslock);
@@ -684,6 +684,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 			else
 				cifs_buf_release(ptmp);
 		}
+		cifs_put_tlink(pCFileStruct->tlink);
 		kfree(file->private_data);
 		file->private_data = NULL;
 	}
@@ -770,7 +771,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	tcon = ((struct cifsFileInfo *)file->private_data)->tcon;
+	tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -970,7 +971,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 		return -EBADF;
 
 	open_file = file->private_data;
-	pTcon = open_file->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	rc = generic_write_checks(file, poffset, &write_size, 0);
 	if (rc)
@@ -1071,7 +1072,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	if (file->private_data == NULL)
 		return -EBADF;
 	open_file = file->private_data;
-	pTcon = open_file->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	xid = GetXid();
 
@@ -1393,7 +1394,7 @@ static int cifs_writepages(struct address_space *mapping,
 		return generic_writepages(mapping, wbc);
 	}
 
-	tcon = open_file->tcon;
+	tcon = tlink_tcon(open_file->tlink);
 	if (!experimEnabled && tcon->ses->server->secMode &
 			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		cifsFileInfo_put(open_file);
@@ -1672,7 +1673,7 @@ int cifs_fsync(struct file *file, int datasync)
 	if (rc == 0) {
 		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = smbfile->tcon;
+		tcon = tlink_tcon(smbfile->tlink);
 		if (!rc && tcon && smbfile &&
 		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
 			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
@@ -1764,7 +1765,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 		return rc;
 	}
 	open_file = file->private_data;
-	pTcon = open_file->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1845,7 +1846,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		return rc;
 	}
 	open_file = file->private_data;
-	pTcon = open_file->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1981,7 +1982,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	}
 	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = open_file->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2345,8 +2346,8 @@ void cifs_oplock_break(struct work_struct *work)
 	 * disconnected since oplock already released by the server
 	 */
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cfile->tcon, cfile->netfid, 0, 0, 0, 0,
-				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aa229692aef1..a39a1c451733 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -289,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = filp->private_data;
-	struct cifsTconInfo *tcon = cfile->tcon;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -546,7 +546,7 @@ int cifs_get_file_info(struct file *filp)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = filp->private_data;
-	struct cifsTconInfo *tcon = cfile->tcon;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -967,7 +967,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
-		pTcon = open_file->tcon;
+		pTcon = tlink_tcon(open_file->tlink);
 		goto set_via_filehandle;
 	}
 
@@ -1696,7 +1696,7 @@ int cifs_revalidate_file(struct file *filp)
 	if (!cifs_inode_needs_reval(inode))
 		goto check_inval;
 
-	if (cfile->tcon->unix_ext)
+	if (tlink_tcon(cfile->tlink)->unix_ext)
 		rc = cifs_get_file_info_unix(filp);
 	else
 		rc = cifs_get_file_info(filp);
@@ -1817,7 +1817,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	if (open_file) {
 		__u16 nfid = open_file->netfid;
 		__u32 npid = open_file->pid;
-		pTcon = open_file->tcon;
+		pTcon = tlink_tcon(open_file->tlink);
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
 		cifsFileInfo_put(open_file);
@@ -1982,7 +1982,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if (open_file) {
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
-		pTcon = open_file->tcon;
+		pTcon = tlink_tcon(open_file->tlink);
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
 		cifsFileInfo_put(open_file);
 	} else {
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index cc70a61a47d2..077bf756f342 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,7 +38,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
 	struct cifsFileInfo *pSMBFile = filep->private_data;
-	struct cifsTconInfo *tcon = pSMBFile->tcon;
+	struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
 	__u64	ExtAttrBits = 0;
 	__u64	ExtAttrMask = 0;
 	__u64   caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b38fe6704ad2..85cdbf831e7b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -346,7 +346,6 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 		goto cifs_hl_exit;
 	}
 
-/*	if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
 	if (pTcon->unix_ext)
 		rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
 					    cifs_sb->local_nls,
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 170047cf4522..1f0bd0f972d4 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -245,7 +245,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = true;
 	cifsFile->srch_inf.endOfSearch = false;
-	cifsFile->tcon = pTcon;
+	cifsFile->tlink = cifs_get_tlink(tlink);
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
@@ -838,7 +838,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		} */
 
-		pTcon = cifsFile->tcon;
+		pTcon = tlink_tcon(cifsFile->tlink);
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-- 
cgit v1.2.3


From 6508d904e6fb66ce4c34617f72b38d6714c4b9f6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 29 Sep 2010 19:51:11 -0400
Subject: cifs: have find_readable/writable_file filter by fsuid

When we implement multiuser mounts, we'll need to filter filehandles
by fsuid. Add a flag for multiuser mounts and code to filter by
fsuid when it's set.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  3 ++-
 fs/cifs/cifsacl.c    |  4 ++--
 fs/cifs/cifsproto.h  |  4 ++--
 fs/cifs/dir.c        |  1 +
 fs/cifs/file.c       | 33 +++++++++++++++++++++++++--------
 fs/cifs/inode.c      |  6 +++---
 6 files changed, 35 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ba0afd3acff4..e04e6923d354 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -37,6 +37,7 @@
 #define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
 #define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
 #define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
+#define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 
 struct cifs_sb_info {
 	struct cifsTconInfo *ptcon;	/* primary mount */
@@ -48,7 +49,7 @@ struct cifs_sb_info {
 	gid_t	mnt_gid;
 	mode_t	mnt_file_mode;
 	mode_t	mnt_dir_mode;
-	int     mnt_cifs_flags;
+	unsigned int mnt_cifs_flags;
 	int	prepathlen;
 	char   *prepath; /* relative path under the share to mount to */
 #ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 2647ea410c4c..c9b4792ae825 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -615,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	struct cifsFileInfo *open_file = NULL;
 
 	if (inode)
-		open_file = find_readable_file(CIFS_I(inode));
+		open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
@@ -685,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 
 	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
-	open_file = find_readable_file(CIFS_I(inode));
+	open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7294723d0625..29a2ee8ae51f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,9 +78,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 #endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5adf47f28fed..e249b561ce8f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -144,6 +144,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 
 	pCifsFile->netfid = fileHandle;
 	pCifsFile->pid = current->tgid;
+	pCifsFile->uid = current_fsuid();
 	pCifsFile->pInode = igrab(newinode);
 	pCifsFile->mnt = mnt;
 	pCifsFile->pfile = file;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 24332d437150..80856f180711 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1168,9 +1168,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 }
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
 {
 	struct cifsFileInfo *open_file = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
 
 	read_lock(&GlobalSMBSeslock);
 	/* we could simply get the first_list_entry since write-only entries
@@ -1179,6 +1185,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
 			continue;
+		if (fsuid_only && open_file->uid != current_fsuid())
+			continue;
 		if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) ||
 		    (open_file->pfile->f_flags & O_RDONLY))) {
 			if (!open_file->invalidHandle) {
@@ -1198,9 +1206,11 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 }
 #endif
 
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
 {
 	struct cifsFileInfo *open_file;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
 	bool any_available = false;
 	int rc;
 
@@ -1214,13 +1224,19 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 		return NULL;
 	}
 
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
+
 	read_lock(&GlobalSMBSeslock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend ||
-		    (!any_available && open_file->pid != current->tgid))
+		if (open_file->closePend)
+			continue;
+		if (!any_available && open_file->pid != current->tgid)
+			continue;
+		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
-
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1315,7 +1331,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	if (mapping->host->i_size - offset < (loff_t)to)
 		to = (unsigned)(mapping->host->i_size - offset);
 
-	open_file = find_writable_file(CIFS_I(mapping->host));
+	open_file = find_writable_file(CIFS_I(mapping->host), false);
 	if (open_file) {
 		bytes_written = cifs_write(open_file->pfile, write_data,
 					   to-from, &offset);
@@ -1388,7 +1404,7 @@ static int cifs_writepages(struct address_space *mapping,
 	 * but it'll at least handle the return. Maybe it should be
 	 * a BUG() instead?
 	 */
-	open_file = find_writable_file(CIFS_I(mapping->host));
+	open_file = find_writable_file(CIFS_I(mapping->host), false);
 	if (!open_file) {
 		kfree(iov);
 		return generic_writepages(mapping, wbc);
@@ -1505,7 +1521,8 @@ retry:
 				break;
 		}
 		if (n_iov) {
-			open_file = find_writable_file(CIFS_I(mapping->host));
+			open_file = find_writable_file(CIFS_I(mapping->host),
+							false);
 			if (!open_file) {
 				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a39a1c451733..df29a3a3d80c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -963,7 +963,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	/*
 	 * If the file is already open for write, just use that fileid
 	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
@@ -1813,7 +1813,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	 * writebehind data than the SMB timeout for the SetPathInfo
 	 * request would allow
 	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		__u16 nfid = open_file->netfid;
 		__u32 npid = open_file->pid;
@@ -1978,7 +1978,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		args->ctime = NO_CHANGE_64;
 
 	args->device = 0;
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
-- 
cgit v1.2.3


From 29e07c82a9e8acebbb38ecc22b0b5005a0a5d839 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 29 Sep 2010 19:51:12 -0400
Subject: cifs: fix cifs_show_options to show "username=" or "multiuser"

...based on CIFS_MOUNT_MULTIUSER flag.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b2fd075dc2e6..c96345c3314d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -369,8 +369,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",unc=%s", tcon->treeName);
-	if (tcon->ses->userName)
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
+		seq_printf(s, ",multiuser");
+	else if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
+
 	if (tcon->ses->domainName)
 		seq_printf(s, ",domain=%s", tcon->ses->domainName);
 
-- 
cgit v1.2.3


From c9928f7040a6e5f39e028bea500e0fde910d4a96 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Mon, 4 Oct 2010 19:56:13 -0500
Subject: ntlm authentication and signing - Correct response length for ntlmv2
 authentication without extended security

Fix incorrect calculation of case sensitive response length in the
ntlmv2 (without extended security) response.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index af18a500f7e0..c926e6c7c0c6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -739,9 +739,6 @@ ssetup_ntlmssp_authenticate:
 		pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
 		/*	cpu_to_le16(LM2_SESS_KEY_SIZE); */
 
-		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(sizeof(struct ntlmv2_resp));
-
 		/* calculate session key */
 		rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
 		if (rc) {
@@ -753,6 +750,11 @@ ssetup_ntlmssp_authenticate:
 				sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
 		kfree(v2_sess_key);
+		/* set case sensitive password length after tilen may get
+		 * assigned, tilen is 0 otherwise.
+		 */
+		pSMB->req_no_secext.CaseSensitivePasswordLength =
+			cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen);
 		if (ses->tilen > 0) {
 			memcpy(bcc_ptr, ses->tiblob, ses->tilen);
 			bcc_ptr += ses->tilen;
@@ -761,6 +763,7 @@ ssetup_ntlmssp_authenticate:
 			ses->tiblob = NULL;
 			ses->tilen = 0;
 		}
+
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
 				*bcc_ptr = 0;
-- 
cgit v1.2.3


From 54b5187b5a1ad6573ade8b18e065dda92501fc52 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 15:26:08 -0700
Subject: ocfs2/cluster: Add heartbeat mode configfs parameter

Add heartbeat mode parameter to the configfs tree. This will be used
to set/show the heartbeat mode. The user is free to toggle the mode
between local and global as long as there is no active heartbeat region.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 72 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..4d36459a8343 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -77,7 +77,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
 
 #define O2HB_DEFAULT_BLOCK_BITS       9
 
+enum o2hb_heartbeat_modes {
+	O2HB_HEARTBEAT_LOCAL		= 0,
+	O2HB_HEARTBEAT_GLOBAL,
+	O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+		"local",	/* O2HB_HEARTBEAT_LOCAL */
+		"global",	/* O2HB_HEARTBEAT_GLOBAL */
+};
+
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
 
 /* Only sets a new threshold if there are no active regions.
  *
@@ -94,6 +106,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
+static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+{
+	int ret = -1;
+
+	if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions)) {
+			o2hb_heartbeat_mode = hb_mode;
+			ret = 0;
+		}
+		spin_unlock(&o2hb_live_lock);
+	}
+
+	return ret;
+}
+
 struct o2hb_node_event {
 	struct list_head        hn_item;
 	enum o2hb_callback_type hn_event_type;
@@ -1688,6 +1716,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
 	return count;
 }
 
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	return sprintf(page, "%s\n",
+		       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	unsigned int i;
+	int ret;
+	size_t len;
+
+	len = (page[count - 1] == '\n') ? count - 1 : count;
+	if (!len)
+		return -EINVAL;
+
+	for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+			continue;
+
+		ret = o2hb_global_hearbeat_mode_set(i);
+		if (!ret)
+			printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n",
+			       o2hb_heartbeat_mode_desc[i]);
+		return count;
+	}
+
+	return -EINVAL;
+
+}
+
 static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
 	.attr	= { .ca_owner = THIS_MODULE,
 		    .ca_name = "dead_threshold",
@@ -1696,8 +1759,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
 	.store	= o2hb_heartbeat_group_threshold_store,
 };
 
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		.ca_name = "mode",
+		.ca_mode = S_IRUGO | S_IWUSR },
+	.show   = o2hb_heartbeat_group_mode_show,
+	.store  = o2hb_heartbeat_group_mode_store,
+};
+
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	&o2hb_heartbeat_group_attr_threshold.attr,
+	&o2hb_heartbeat_group_attr_mode.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 98f486f23bc5b6a6fa90e1a0707b7e9fe0e7f3e4 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Sat, 9 Oct 2010 10:24:46 -0700
Subject: ocfs2: Add an incompat feature flag
 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO

OCFS2_FEATURE_INCOMPAT_CLUSTERINFO allows us to use sb->s_cluster_info for
both userspace and o2cb cluster stacks. It also allows us to extend cluster
info to include stack flags.

This patch also adds stackflags to sb->s_clusterinfo. It also introduces a
clusterinfo flag OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT to denote the enabled
global heartbeat mode.

This incompat flag can be set/cleared using tunefs.ocfs2 --fs-features. The
clusterinfo flag is set/cleared using tunefs.ocfs2 --update-cluster-stack.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/ocfs2.h    | 31 +++++++++++++++++++++++++++++--
 fs/ocfs2/ocfs2_fs.h | 40 ++++++++++++++++++++++++++++++++++------
 fs/ocfs2/super.c    |  4 +++-
 3 files changed, 66 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..d5496a792bdb 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -368,6 +368,8 @@ struct ocfs2_super
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 
+	u8 osb_stackflags;
+
 	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +603,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
 	return ret;
 }
 
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
 {
 	return (osb->s_feature_incompat &
-		OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+		(OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+		 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+	return ocfs2_o2cb_stack(osb) &&
+		(osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
 }
 
 static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index fa31d05e41b7..d5b1d99abc3c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
-					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -169,6 +170,13 @@
 /* Discontigous block groups */
 #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	0x2000
 
+/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -292,10 +300,13 @@
 #define OCFS2_VOL_UUID_LEN		16
 #define OCFS2_MAX_VOL_LABEL_LEN		64
 
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
 #define OCFS2_STACK_LABEL_LEN		4
 #define OCFS2_CLUSTER_NAME_LEN		16
 
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK	"o2cb"
+
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
@@ -305,6 +316,11 @@
  */
 #define OCFS2_MIN_XATTR_INLINE_SIZE     256
 
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT	(0x01)
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
@@ -566,9 +582,21 @@ struct ocfs2_slot_map_extended {
  */
 };
 
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
 struct ocfs2_cluster_info {
 /*00*/	__u8   ci_stack[OCFS2_STACK_LABEL_LEN];
-	__le32 ci_reserved;
+	union {
+		__le32 ci_reserved;
+		struct {
+			__u8 ci_stackflags;
+			__u8 ci_reserved1;
+			__u8 ci_reserved2;
+			__u8 ci_reserved3;
+		};
+	};
 /*08*/	__u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
 /*18*/
 };
@@ -605,9 +633,9 @@ struct ocfs2_super_block {
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
 /*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
-/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
-						     stack.  Only valid
-						     with INCOMPAT flag. */
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+						     userspace or clusterinfo
+						     INCOMPAT flag set. */
 /*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
 					   for this fs*/
 	__le16 s_reserved0;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..755431739396 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2149,7 +2149,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	if (ocfs2_userspace_stack(osb)) {
+	if (ocfs2_clusterinfo_valid(osb)) {
+		osb->osb_stackflags =
+			OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
 		memcpy(osb->osb_cluster_stack,
 		       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
 		       OCFS2_STACK_LABEL_LEN);
-- 
cgit v1.2.3


From 2c442719e90a44a6982c033d69df4aae4b167cfa Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 15:23:50 -0700
Subject: ocfs2: Add support for heartbeat=global mount option

Adds support for heartbeat=global mount option. It ensures that the heartbeat
mode passed matches the one enabled on disk.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/ocfs2.h    |  4 +++-
 fs/ocfs2/ocfs2_fs.h |  1 +
 fs/ocfs2/super.c    | 55 +++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d5496a792bdb..481387b90b21 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -243,7 +243,7 @@ enum ocfs2_local_alloc_state
 
 enum ocfs2_mount_options
 {
-	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
 	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +256,8 @@ enum ocfs2_mount_options
 						   control lists */
 	OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
 	OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+	OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */
+	OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */
 };
 
 #define OCFS2_OSB_SOFT_RO			0x0001
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index d5b1d99abc3c..28ff536b4f8d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -376,6 +376,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 /* Parameter passed from mount.ocfs2 to module */
 #define OCFS2_HB_NONE			"heartbeat=none"
 #define OCFS2_HB_LOCAL			"heartbeat=local"
+#define OCFS2_HB_GLOBAL			"heartbeat=global"
 
 /*
  * OCFS2 directory file types.  Only the low 3 bits are used.  The
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 755431739396..4e009ad303a1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
 	Opt_nointr,
 	Opt_hb_none,
 	Opt_hb_local,
+	Opt_hb_global,
 	Opt_data_ordered,
 	Opt_data_writeback,
 	Opt_atime_quantum,
@@ -190,6 +191,7 @@ static const match_table_t tokens = {
 	{Opt_nointr, "nointr"},
 	{Opt_hb_none, OCFS2_HB_NONE},
 	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_hb_global, OCFS2_HB_GLOBAL},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_atime_quantum, "atime_quantum=%u"},
@@ -608,6 +610,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	int ret = 0;
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u32 tmp;
 
 	lock_kernel();
 
@@ -617,8 +620,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		goto out;
 	}
 
-	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
-	    (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+	tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+		OCFS2_MOUNT_HB_NONE;
+	if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
 		ret = -EINVAL;
 		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
 		goto out;
@@ -809,23 +813,29 @@ bail:
 
 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
 {
-	if (ocfs2_mount_local(osb)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+	u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+	if (osb->s_mount_opt & hb_enabled) {
+		if (ocfs2_mount_local(osb)) {
 			mlog(ML_ERROR, "Cannot heartbeat on a locally "
 			     "mounted device.\n");
 			return -EINVAL;
 		}
-	}
-
-	if (ocfs2_userspace_stack(osb)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+		if (ocfs2_userspace_stack(osb)) {
 			mlog(ML_ERROR, "Userspace stack expected, but "
 			     "o2cb heartbeat arguments passed to mount\n");
 			return -EINVAL;
 		}
+		if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+		     !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+		    ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+		     ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+			mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+			return -EINVAL;
+		}
 	}
 
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+	if (!(osb->s_mount_opt & hb_enabled)) {
 		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
 		    !ocfs2_userspace_stack(osb)) {
 			mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1291,6 +1301,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 {
 	int status;
 	char *p;
+	u32 tmp;
 
 	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
 		   options ? options : "(none)");
@@ -1322,7 +1333,10 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
 			break;
 		case Opt_hb_none:
-			mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+			mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+			break;
+		case Opt_hb_global:
+			mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
 			break;
 		case Opt_barrier:
 			if (match_int(&args[0], &option)) {
@@ -1477,6 +1491,15 @@ static int ocfs2_parse_options(struct super_block *sb,
 		}
 	}
 
+	/* Ensure only one heartbeat mode */
+	tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+				 OCFS2_MOUNT_HB_NONE);
+	if (hweight32(tmp) != 1) {
+		mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+		status = 0;
+		goto bail;
+	}
+
 	status = 1;
 
 bail:
@@ -1490,10 +1513,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	unsigned long opts = osb->s_mount_opt;
 	unsigned int local_alloc_megs;
 
-	if (opts & OCFS2_MOUNT_HB_LOCAL)
-		seq_printf(s, ",_netdev,heartbeat=local");
-	else
-		seq_printf(s, ",heartbeat=none");
+	if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+		seq_printf(s, ",_netdev");
+		if (opts & OCFS2_MOUNT_HB_LOCAL)
+			seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+		else
+			seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+	} else
+		seq_printf(s, ",%s", OCFS2_HB_NONE);
 
 	if (opts & OCFS2_MOUNT_NOINTR)
 		seq_printf(s, ",nointr");
-- 
cgit v1.2.3


From b1365d0bd14b912cceb424cbeed9fe939a9038e3 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:34 -0700
Subject: ocfs2/dlm: Expose dlm_protocol in dlm_state

Add dlm_protocol to the list of info shown by the debugfs file, dlm_state.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 901ca52bf86b..f693ab812f3e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 
 	/* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
 	out += snprintf(db->buf + out, db->len - out,
-			"Domain: %s  Key: 0x%08x\n", dlm->name, dlm->key);
+			"Domain: %s  Key: 0x%08x  Protocol: %d.%d\n",
+			dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+			dlm->dlm_locking_proto.pv_minor);
 
 	/* Thread Pid: xxx  Node: xxx  State: xxxxx */
 	out += snprintf(db->buf + out, db->len - out,
-- 
cgit v1.2.3


From b3c85c4cdf77154acc940dd0f14d1fb99cbbaf75 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 14:31:06 -0700
Subject: ocfs2/cluster: Get all heartbeat regions

Export function in o2hb to get a list of heartbeat regions. It also adds an
upper limit to the length of the heartbeat region name.

o2hb_global_heartbeat_active() currently disables global heartbeat. It will
be enabled in a later patch after all the code is added.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 34 ++++++++++++++++++++++++++++++++++
 fs/ocfs2/cluster/heartbeat.h |  4 ++++
 2 files changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4d36459a8343..3415e58ff77b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1623,6 +1623,9 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 	if (reg == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
 	spin_lock(&o2hb_live_lock);
@@ -2035,3 +2038,34 @@ void o2hb_stop_all_regions(void)
 	spin_unlock(&o2hb_live_lock);
 }
 EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+	struct o2hb_region *reg;
+	int numregs = 0;
+	char *p;
+
+	spin_lock(&o2hb_live_lock);
+
+	p = region_uuids;
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+		if (numregs < max_regions) {
+			memcpy(p, config_item_name(&reg->hr_item),
+			       O2HB_MAX_REGION_NAME_LEN);
+			p += O2HB_MAX_REGION_NAME_LEN;
+		}
+		numregs++;
+	}
+
+	spin_unlock(&o2hb_live_lock);
+
+	return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+	return 0;
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
 
 #define O2HB_REGION_TIMEOUT_MS		2000
 
+#define O2HB_MAX_REGION_NAME_LEN	32
+
 /* number of changes to be seen as live */
 #define O2HB_LIVE_THRESHOLD	   2
 /* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
 
 #endif /* O2CLUSTER_HEARTBEAT_H */
-- 
cgit v1.2.3


From ea2034416b54700e30371f2ad6517cbb94674083 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Sat, 9 Oct 2010 10:26:23 -0700
Subject: ocfs2/dlm: Add message DLM_QUERY_REGION

Adds new dlm message DLM_QUERY_REGION that sends the names of all active
heartbeat regions. This message is only sent in the global heartbeat
mode. If the regions in the joining node do not fully match the ones in
the active nodes, the join domain request is rejected.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/ocfs2_nodemanager.h |   6 +
 fs/ocfs2/dlm/dlmcommon.h             |  12 +-
 fs/ocfs2/dlm/dlmdomain.c             | 218 +++++++++++++++++++++++++++++++++++
 3 files changed, 235 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
 /* host name, group name, cluster name all 64 bytes */
 #define O2NM_MAX_NAME_LEN        64    // __NEW_UTS_LEN
 
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION**  Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS	32
+
 #endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 765298908f1d..aa506d3e2ae6 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,8 @@ enum {
 	DLM_LOCK_REQUEST_MSG,	 /* 515 */
 	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
 	DLM_BEGIN_RECO_MSG,	 /* 517 */
-	DLM_FINALIZE_RECO_MSG	 /* 518 */
+	DLM_FINALIZE_RECO_MSG,	 /* 518 */
+	DLM_QUERY_REGION,	 /* 519 */
 };
 
 struct dlm_reco_node_data
@@ -727,6 +728,15 @@ struct dlm_cancel_join
 	u8 domain[O2NM_MAX_NAME_LEN];
 };
 
+struct dlm_query_region {
+	u8 qr_node;
+	u8 qr_numregions;
+	u8 qr_namelen;
+	u8 pad1;
+	u8 qr_domain[O2NM_MAX_NAME_LEN];
+	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
 struct dlm_exit_domain
 {
 	u8 node_idx;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 11a5c87fd7f7..49650756dfef 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,6 +128,9 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  * will have a negotiated version with the same major number and a minor
  * number equal or smaller.  The dlm_ctxt->dlm_locking_proto field should
  * be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ *	- Message DLM_QUERY_REGION added to support global heartbeat
  */
 static const struct dlm_protocol_version dlm_protocol = {
 	.pv_major = 1,
@@ -142,6 +145,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 				     void **ret_data);
 static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data);
 static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data);
 static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -921,6 +926,203 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 	return 0;
 }
 
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+			     struct dlm_query_region *qr)
+{
+	char *local = NULL, *remote = qr->qr_regions;
+	char *l, *r;
+	int localnr, i, j, foundit;
+	int status = 0;
+
+	if (!o2hb_global_heartbeat_active()) {
+		if (qr->qr_numregions) {
+			mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+			     "heartbeat enabled but local node %d does not\n",
+			     qr->qr_domain, qr->qr_node, dlm->node_num);
+			status = -EINVAL;
+		}
+		goto bail;
+	}
+
+	if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+		mlog(ML_ERROR, "Domain %s: Local node %d has global "
+		     "heartbeat enabled but joining node %d does not\n",
+		     qr->qr_domain, dlm->node_num, qr->qr_node);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+	if (!local) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
+
+	/* compare local regions with remote */
+	l = local;
+	for (i = 0; i < localnr; ++i) {
+		foundit = 0;
+		r = remote;
+		for (j = 0; j <= qr->qr_numregions; ++j) {
+			if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			r += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in local node %d but not in joining node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+			     dlm->node_num, qr->qr_node);
+			goto bail;
+		}
+		l += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* compare remote with local regions */
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		foundit = 0;
+		l = local;
+		for (j = 0; j < localnr; ++j) {
+			if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			l += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in joining node %d but not in local node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+			     qr->qr_node, dlm->node_num);
+			goto bail;
+		}
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+bail:
+	kfree(local);
+
+	return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_region *qr = NULL;
+	int status, ret = 0, i;
+	char *p;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+	if (!qr) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	qr->qr_node = dlm->node_num;
+	qr->qr_namelen = strlen(dlm->name);
+	memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+	/* if local hb, the numregions will be zero */
+	if (o2hb_global_heartbeat_active())
+		qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+							 O2NM_MAX_REGIONS);
+
+	p = qr->qr_regions;
+	for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending regions to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+					 sizeof(struct dlm_query_region),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+			     ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qr);
+	return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data)
+{
+	struct dlm_query_region *qr;
+	struct dlm_ctxt *dlm = NULL;
+	int status = 0;
+	int locked = 0;
+
+	qr = (struct dlm_query_region *) msg->buf;
+
+	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+	     qr->qr_domain);
+
+	status = -EINVAL;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "before join domain\n", qr->qr_node, qr->qr_domain);
+		goto bail;
+	}
+
+	spin_lock(&dlm->spinlock);
+	locked = 1;
+	if (dlm->joining_node != qr->qr_node) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+		     dlm->joining_node);
+		goto bail;
+	}
+
+	/* Support for global heartbeat was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but active dlm protocol is %d.%d\n", qr->qr_node,
+		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto bail;
+	}
+
+	status = dlm_match_regions(dlm, qr);
+
+bail:
+	if (locked)
+		spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
 static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data)
 {
@@ -1241,6 +1443,15 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 	set_bit(dlm->node_num, dlm->domain_map);
 	spin_unlock(&dlm->spinlock);
 
+	/* Support for global heartbeat was added in 1.1 */
+	if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
+		status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
 
 	/* Joined state *must* be set before the joining node
@@ -1807,6 +2018,13 @@ static int dlm_register_net_handlers(void)
 					sizeof(struct dlm_cancel_join),
 					dlm_cancel_join_handler,
 					NULL, NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+					sizeof(struct dlm_query_region),
+					dlm_query_region_handler,
+					NULL, NULL, &dlm_join_handlers);
 
 bail:
 	if (status < 0)
-- 
cgit v1.2.3


From 5f3c6d9c615770708df464c170316f83cf437242 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:29 -0700
Subject: ocfs2: Print message if user mounts without starting global heartbeat

In global heartbeat mode, the heartbeat is started by the user. This patch
prints an error if the user attempts to mount a volume without starting the
heartbeat.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/stack_o2cb.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
 	if (!o2hb_check_local_node_heartbeating()) {
+		if (o2hb_global_heartbeat_active())
+			mlog(ML_ERROR, "Global heartbeat not started\n");
 		rc = -EINVAL;
 		goto out;
 	}
-- 
cgit v1.2.3


From 18cfdf1b1a8e83b09e4185c02396257ce7e7bee3 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 16:47:03 -0700
Subject: ocfs2/dlm: Add message DLM_QUERY_NODEINFO

Adds new dlm message DLM_QUERY_NODEINFO that sends the attributes of all
registered nodes. This message is sent if the negotiated dlm protocol is
1.1 or higher. If the information of the joining node does not match
that of any existing nodes, the join domain request is rejected.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h |  17 +++++
 fs/ocfs2/dlm/dlmdomain.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 198 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index aa506d3e2ae6..b36d0bf77a5a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -447,6 +447,7 @@ enum {
 	DLM_BEGIN_RECO_MSG,	 /* 517 */
 	DLM_FINALIZE_RECO_MSG,	 /* 518 */
 	DLM_QUERY_REGION,	 /* 519 */
+	DLM_QUERY_NODEINFO,	 /* 520 */
 };
 
 struct dlm_reco_node_data
@@ -737,6 +738,22 @@ struct dlm_query_region {
 	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
 };
 
+struct dlm_node_info {
+	u8 ni_nodenum;
+	u8 pad1;
+	u16 ni_ipv4_port;
+	u32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+	u8 qn_nodenum;
+	u8 qn_numnodes;
+	u8 qn_namelen;
+	u8 pad1;
+	u8 qn_domain[O2NM_MAX_NAME_LEN];
+	struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
 struct dlm_exit_domain
 {
 	u8 node_idx;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 49650756dfef..78d428f5e10e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -131,6 +131,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  *
  * New in version 1.1:
  *	- Message DLM_QUERY_REGION added to support global heartbeat
+ *	- Message DLM_QUERY_NODEINFO added to allow online node removes
  */
 static const struct dlm_protocol_version dlm_protocol = {
 	.pv_major = 1,
@@ -1123,6 +1124,173 @@ bail:
 	return status;
 }
 
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+	struct o2nm_node *local;
+	struct dlm_node_info *remote;
+	int i, j;
+	int status = 0;
+
+	for (j = 0; j < qn->qn_numnodes; ++j)
+		mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+		     &(qn->qn_nodes[j].ni_ipv4_address),
+		     ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+	for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+		local = o2nm_get_node_by_num(i);
+		remote = NULL;
+		for (j = 0; j < qn->qn_numnodes; ++j) {
+			if (qn->qn_nodes[j].ni_nodenum == i) {
+				remote = &(qn->qn_nodes[j]);
+				break;
+			}
+		}
+
+		if (!local && !remote)
+			continue;
+
+		if ((local && !remote) || (!local && remote))
+			status = -EINVAL;
+
+		if (!status &&
+		    ((remote->ni_nodenum != local->nd_num) ||
+		     (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+		     (remote->ni_ipv4_address != local->nd_ipv4_address)))
+			status = -EINVAL;
+
+		if (status) {
+			if (remote && !local)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in joining node %d but not in "
+				     "local node %d\n", qn->qn_domain,
+				     remote->ni_nodenum,
+				     &(remote->ni_ipv4_address),
+				     ntohs(remote->ni_ipv4_port),
+				     qn->qn_nodenum, dlm->node_num);
+			if (local && !remote)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in local node %d but not in "
+				     "joining node %d\n", qn->qn_domain,
+				     local->nd_num, &(local->nd_ipv4_address),
+				     ntohs(local->nd_ipv4_port),
+				     dlm->node_num, qn->qn_nodenum);
+			BUG_ON((!local && !remote));
+		}
+
+		if (local)
+			o2nm_node_put(local);
+	}
+
+	return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_nodeinfo *qn = NULL;
+	struct o2nm_node *node;
+	int ret = 0, status, count, i;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+	if (!qn) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+		node = o2nm_get_node_by_num(i);
+		if (!node)
+			continue;
+		qn->qn_nodes[count].ni_nodenum = node->nd_num;
+		qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+		qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+		mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+		     &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+		++count;
+		o2nm_node_put(node);
+	}
+
+	qn->qn_nodenum = dlm->node_num;
+	qn->qn_numnodes = count;
+	qn->qn_namelen = strlen(dlm->name);
+	memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending nodeinfo to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					 qn, sizeof(struct dlm_query_nodeinfo),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qn);
+	return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+				      void *data, void **ret_data)
+{
+	struct dlm_query_nodeinfo *qn;
+	struct dlm_ctxt *dlm = NULL;
+	int locked = 0, status = -EINVAL;
+
+	qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+	mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+	     qn->qn_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+		     "join domain\n", qn->qn_nodenum, qn->qn_domain);
+		goto bail;
+	}
+
+	spin_lock(&dlm->spinlock);
+	locked = 1;
+	if (dlm->joining_node != qn->qn_nodenum) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+		     "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+		     dlm->joining_node);
+		goto bail;
+	}
+
+	/* Support for node query was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+		     "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+		     qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto bail;
+	}
+
+	status = dlm_match_nodes(dlm, qn);
+
+bail:
+	if (locked)
+		spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
 static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data)
 {
@@ -1443,8 +1611,13 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 	set_bit(dlm->node_num, dlm->domain_map);
 	spin_unlock(&dlm->spinlock);
 
-	/* Support for global heartbeat was added in 1.1 */
+	/* Support for global heartbeat and node info was added in 1.1 */
 	if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
+		status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
 		status = dlm_send_regions(dlm, ctxt->yes_resp_map);
 		if (status) {
 			mlog_errno(status);
@@ -2026,6 +2199,13 @@ static int dlm_register_net_handlers(void)
 					dlm_query_region_handler,
 					NULL, NULL, &dlm_join_handlers);
 
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					sizeof(struct dlm_query_nodeinfo),
+					dlm_query_nodeinfo_handler,
+					NULL, NULL, &dlm_join_handlers);
 bail:
 	if (status < 0)
 		dlm_unregister_net_handlers();
-- 
cgit v1.2.3


From 18c50cb0d3c293eabd6c2ef89c43f2a968e709ed Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 18:26:59 -0700
Subject: ocfs2/cluster: Print messages when adding/removing heartbeat regions

Prints messages when the user adds or removes heartbeat regions in global
heartbeat mode. These messages are useful when debugging cluster related issues.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 3415e58ff77b..12bb12ba8640 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1476,6 +1476,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	else
 		ret = -EIO;
 
+	if (hb_task && o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
+		       config_item_name(&reg->hr_item));
+
 out:
 	if (filp)
 		fput(filp);
@@ -1659,6 +1663,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 		wake_up(&o2hb_steady_queue);
 	}
 
+	if (o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
+		       config_item_name(&reg->hr_item));
 	config_item_put(item);
 }
 
@@ -1745,7 +1752,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 
 		ret = o2hb_global_hearbeat_mode_set(i);
 		if (!ret)
-			printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n",
+			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
 			       o2hb_heartbeat_mode_desc[i]);
 		return count;
 	}
-- 
cgit v1.2.3


From 39a298563e0619b1b6e2e0974e58801de780621c Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 17:30:17 -0700
Subject: ocfs2/cluster: Print messages when adding/removing nodes

Prints messages when the user adds or removes nodes.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/masklog.h     | 3 ++-
 fs/ocfs2/cluster/nodemanager.c | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..ea2ed9f56c94 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
-#define	ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER	0x0000001000000000ULL /* cluster stack */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
 	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
 	spin_lock_init(&node->nd_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
 	return &node->nd_item;
 }
 
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
 	}
 	write_unlock(&cluster->cl_nodes_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+	     config_item_name(&node->nd_item));
+
 	config_item_put(item);
 }
 
-- 
cgit v1.2.3


From 0e105d37c2adb19cb777aa6701a866f211764a30 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 17:00:16 -0700
Subject: ocfs2/cluster: Check slots for unconfigured live nodes

o2hb currently checks slots for configured nodes only. This patch makes
it check the slots for the live nodes too to take care of a race in which
a node is removed from the configuration but not from the live map.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 38 +++++++++++++++++++++++++++++++-------
 fs/ocfs2/cluster/tcp.c       |  5 +++++
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 12bb12ba8640..a8f10649674d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -541,6 +541,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
 {
 	assert_spin_locked(&o2hb_live_lock);
 
+	BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
 	event->hn_event_type = type;
 	event->hn_node = node;
 	event->hn_node_num = node_num;
@@ -593,14 +595,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	u64 cputime;
 	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
 	unsigned int slot_dead_ms;
+	int tmp;
 
 	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
 
-	/* Is this correct? Do we assume that the node doesn't exist
-	 * if we're not configured for him? */
+	/*
+	 * If a node is no longer configured but is still in the livemap, we
+	 * may need to clear that bit from the livemap.
+	 */
 	node = o2nm_get_node_by_num(slot->ds_node_num);
-	if (!node)
-		return 0;
+	if (!node) {
+		spin_lock(&o2hb_live_lock);
+		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		if (!tmp)
+			return 0;
+	}
 
 	if (!o2hb_verify_crc(reg, hb_block)) {
 		/* all paths from here will drop o2hb_live_lock for
@@ -717,8 +727,9 @@ fire_callbacks:
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
 			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
-			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
-					      slot->ds_node_num);
+			/* node can be null */
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+					      node, slot->ds_node_num);
 
 			changed = 1;
 		}
@@ -738,7 +749,8 @@ out:
 
 	o2hb_run_event_list(&event);
 
-	o2nm_node_put(node);
+	if (node)
+		o2nm_node_put(node);
 	return changed;
 }
 
@@ -765,6 +777,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
 	int i, ret, highest_node, change = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct o2hb_bio_wait_ctxt write_wc;
 
 	ret = o2nm_configured_node_map(configured_nodes,
@@ -774,6 +787,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 		return ret;
 	}
 
+	/*
+	 * If a node is not configured but is in the livemap, we still need
+	 * to read the slot so as to be able to remove it from the livemap.
+	 */
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	i = -1;
+	while ((i = find_next_bit(live_node_bitmap,
+				  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+		set_bit(i, configured_nodes);
+	}
+
 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
 	if (highest_node >= O2NM_MAX_NODES) {
 		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc28..9aa426e42123 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
 {
 	o2quo_hb_down(node_num);
 
+	if (!node)
+		return;
+
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
 
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 	o2quo_hb_up(node_num);
 
+	BUG_ON(!node);
+
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
 		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
-- 
cgit v1.2.3


From 8ca8b0bbd841b6bcd8ac05e51b0143aa61cfeff3 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 17:01:27 -0700
Subject: ocfs2/cluster: Reorganize o2hb debugfs init

o2hb debugfs handling is reorganized to allow for easy expansion.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 101 +++++++++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a8f10649674d..16e49765c853 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,8 +62,19 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+#define O2HB_DB_TYPE_LIVENODES		0
+struct o2hb_debug_buf {
+	int db_type;
+	int db_size;
+	int db_len;
+	void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+
 #define O2HB_DEBUG_DIR			"o2hb"
 #define O2HB_DEBUG_LIVENODES		"livenodes"
+
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
 
@@ -969,21 +980,35 @@ static int o2hb_thread(void *data)
 #ifdef CONFIG_DEBUG_FS
 static int o2hb_debug_open(struct inode *inode, struct file *file)
 {
+	struct o2hb_debug_buf *db = inode->i_private;
 	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	char *buf = NULL;
 	int i = -1;
 	int out = 0;
 
+	/* max_nodes should be the largest bitmap we pass here */
+	BUG_ON(sizeof(map) < db->db_size);
+
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
 		goto bail;
 
-	o2hb_fill_node_map(map, sizeof(map));
+	switch (db->db_type) {
+	case O2HB_DB_TYPE_LIVENODES:
+		spin_lock(&o2hb_live_lock);
+		memcpy(map, db->db_data, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	default:
+		goto done;
+	}
 
-	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
 		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
 	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
 
+done:
 	i_size_write(inode, out);
 
 	file->private_data = buf;
@@ -1030,10 +1055,56 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-	if (o2hb_debug_livenodes)
-		debugfs_remove(o2hb_debug_livenodes);
-	if (o2hb_debug_dir)
-		debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	debugfs_remove(o2hb_debug_livenodes);
+	debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	*db = kmalloc(db_len, GFP_KERNEL);
+	if (!*db)
+		return NULL;
+
+	(*db)->db_type = type;
+	(*db)->db_size = size;
+	(*db)->db_len = len;
+	(*db)->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+				   &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+						 o2hb_debug_dir,
+						 &o2hb_db_livenodes,
+						 sizeof(*o2hb_db_livenodes),
+						 O2HB_DB_TYPE_LIVENODES,
+						 sizeof(o2hb_live_node_bitmap),
+						 O2NM_MAX_NODES,
+						 o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+	ret = 0;
+bail:
+	if (ret)
+		o2hb_exit();
+
+	return ret;
 }
 
 int o2hb_init(void)
@@ -1050,23 +1121,7 @@ int o2hb_init(void)
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
 
-	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
-	if (!o2hb_debug_dir) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-
-	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
-						   S_IFREG|S_IRUSR,
-						   o2hb_debug_dir, NULL,
-						   &o2hb_debug_fops);
-	if (!o2hb_debug_livenodes) {
-		mlog_errno(-ENOMEM);
-		debugfs_remove(o2hb_debug_dir);
-		return -ENOMEM;
-	}
-
-	return 0;
+	return o2hb_debug_init();
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
-- 
cgit v1.2.3


From 823a637ae933fde8fdb280612dd3ff9912e301e3 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:21 -0700
Subject: ocfs2/cluster: Maintain live node bitmap per heartbeat region

Currently we track a global livenode bitmap that keeps track of all nodes
that are heartbeating in all regions.

This patch adds the ability to track the livenode bitmap on a per region basis.
We will use this facility in a later patch to allow us to withstand the loss of
a minority number of regions.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16e49765c853..188f50269b89 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -174,6 +174,9 @@ struct o2hb_region {
 	struct block_device	*hr_bdev;
 	struct o2hb_disk_slot	*hr_slots;
 
+	/* live node map of this region */
+	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state.  This will be fixed when we have
 	 * a more complete api that doesn't lead to this sort of fragility. */
@@ -688,6 +691,8 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
 		     slot->ds_node_num, (long long)slot->ds_last_generation);
 
+		set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* first on the list generates a callback */
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
 			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
@@ -733,6 +738,8 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d left my region\n",
 		     slot->ds_node_num);
 
+		clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* last off the live_slot generates a callback */
 		list_del_init(&slot->ds_live_item);
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
-- 
cgit v1.2.3


From 536f0741f324f116d8b059295999945a2dac56bc Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 17:03:07 -0700
Subject: ocfs2/cluster: Track number of global heartbeat regions

In global heartbeat mode, we have a upper limit for the number of active regions.
This patch adds the facility to track the number of active global heartbeat
regions and fails to start heartbeat if the number exceeds the maximum.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 188f50269b89..d66b17c000d4 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,6 +62,12 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * 	- o2hb_region_bitmap allows us to limit the region number to max region.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
 #define O2HB_DB_TYPE_LIVENODES		0
 struct o2hb_debug_buf {
 	int db_type;
@@ -176,6 +182,7 @@ struct o2hb_region {
 
 	/* live node map of this region */
 	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned int		hr_region_num;
 
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state.  This will be fixed when we have
@@ -1127,6 +1134,7 @@ int o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
 
 	return o2hb_debug_init();
 }
@@ -1716,12 +1724,22 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
-
 	spin_lock(&o2hb_live_lock);
+	reg->hr_region_num = 0;
+	if (o2hb_global_heartbeat_active()) {
+		reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+							 O2NM_MAX_REGIONS);
+		if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+			spin_unlock(&o2hb_live_lock);
+			return ERR_PTR(-EFBIG);
+		}
+		set_bit(reg->hr_region_num, o2hb_region_bitmap);
+	}
 	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
 	spin_unlock(&o2hb_live_lock);
 
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
 	return &reg->hr_item;
 }
 
@@ -1733,6 +1751,8 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 
 	/* stop the thread when the user removes the region dir */
 	spin_lock(&o2hb_live_lock);
+	if (o2hb_global_heartbeat_active())
+		clear_bit(reg->hr_region_num, o2hb_region_bitmap);
 	hb_task = reg->hr_task;
 	reg->hr_task = NULL;
 	spin_unlock(&o2hb_live_lock);
-- 
cgit v1.2.3


From e7d656baf6607a0775f4ca85464a4ead306741e5 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:18 -0700
Subject: ocfs2/cluster: Track bitmap of live heartbeat regions

A heartbeat region becomes live (or active) after a fixed number of (steady)
iterations.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index d66b17c000d4..2a7cd17e96f0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -65,8 +65,10 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 /*
  * In global heartbeat, we maintain a series of region bitmaps.
  * 	- o2hb_region_bitmap allows us to limit the region number to max region.
+ * 	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
  */
 static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 
 #define O2HB_DB_TYPE_LIVENODES		0
 struct o2hb_debug_buf {
@@ -1135,6 +1137,7 @@ int o2hb_init(void)
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
 	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
 
 	return o2hb_debug_init();
 }
@@ -1563,6 +1566,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	/* Ok, we were woken.  Make sure it wasn't by drop_item() */
 	spin_lock(&o2hb_live_lock);
 	hb_task = reg->hr_task;
+	if (o2hb_global_heartbeat_active())
+		set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
 	spin_unlock(&o2hb_live_lock);
 
 	if (hb_task)
@@ -1751,8 +1756,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 
 	/* stop the thread when the user removes the region dir */
 	spin_lock(&o2hb_live_lock);
-	if (o2hb_global_heartbeat_active())
+	if (o2hb_global_heartbeat_active()) {
 		clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+		clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+	}
 	hb_task = reg->hr_task;
 	reg->hr_task = NULL;
 	spin_unlock(&o2hb_live_lock);
-- 
cgit v1.2.3


From 43182d2a799865872041b6e4d8387131e9462f56 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:16 -0700
Subject: ocfs2/cluster: Maintain bitmap of quorum regions

o2hb allows online adding of regions. However, a newly added region is not
used in quorum calculations unless it has been added on all nodes. This patch
tracks a bitmap of such quorum regions.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 2a7cd17e96f0..62a8af271344 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -66,9 +66,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
  * In global heartbeat, we maintain a series of region bitmaps.
  * 	- o2hb_region_bitmap allows us to limit the region number to max region.
  * 	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * 	- o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * 		heartbeat on it.
  */
 static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 
 #define O2HB_DB_TYPE_LIVENODES		0
 struct o2hb_debug_buf {
@@ -607,6 +610,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
 	o2nm_node_put(node);
 }
 
+static void o2hb_set_quorum_device(struct o2hb_region *reg,
+				   struct o2hb_disk_slot *slot)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	if (!o2hb_global_heartbeat_active())
+		return;
+
+	if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+		return;
+
+	/*
+	 * A region can be added to the quorum only when it sees all
+	 * live nodes heartbeat on it. In other words, the region has been
+	 * added to all nodes.
+	 */
+	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+		   sizeof(o2hb_live_node_bitmap)))
+		return;
+
+	if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
+		return;
+
+	printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
+	       config_item_name(&reg->hr_item));
+
+	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+}
+
 static int o2hb_check_slot(struct o2hb_region *reg,
 			   struct o2hb_disk_slot *slot)
 {
@@ -772,6 +804,8 @@ fire_callbacks:
 		slot->ds_equal_samples = 0;
 	}
 out:
+	o2hb_set_quorum_device(reg, slot);
+
 	spin_unlock(&o2hb_live_lock);
 
 	o2hb_run_event_list(&event);
@@ -1138,6 +1172,7 @@ int o2hb_init(void)
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
 	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
 	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
 
 	return o2hb_debug_init();
 }
-- 
cgit v1.2.3


From b1c5ebfbe398b3360614a4788c02061cd153e60a Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 7 Oct 2010 17:05:52 -0700
Subject: ocfs2/cluster: Maintain bitmap of failed regions

In global heartbeat mode, we track the bitmap of regions that have seen
heartbeat timeouts. We fence if the number of such regions is greater than
or equal to half the number of quorum regions.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 62a8af271344..f890656127fa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -68,10 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
  * 	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
  * 	- o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
  * 		heartbeat on it.
+ * 	- o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
  */
 static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 
 #define O2HB_DB_TYPE_LIVENODES		0
 struct o2hb_debug_buf {
@@ -217,8 +219,19 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+static int o2hb_pop_count(void *map, int count)
+{
+	int i = -1, pop = 0;
+
+	while ((i = find_next_bit(map, count, i + 1)) < count)
+		pop++;
+	return pop;
+}
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
+	int failed, quorum;
+	unsigned long flags;
 	struct o2hb_region *reg =
 		container_of(work, struct o2hb_region,
 			     hr_write_timeout_work.work);
@@ -226,6 +239,28 @@ static void o2hb_write_timeout(struct work_struct *work)
 	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
 	     "milliseconds\n", reg->hr_dev_name,
 	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock_irqsave(&o2hb_live_lock, flags);
+		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+					O2NM_MAX_REGIONS);
+		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+					O2NM_MAX_REGIONS);
+		spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+		mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+		     quorum, failed);
+
+		/*
+		 * Fence if the number of failed regions >= half the number
+		 * of  quorum regions
+		 */
+		if ((failed << 1) < quorum)
+			return;
+	}
+
 	o2quo_disk_timeout();
 }
 
@@ -234,6 +269,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 	mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
 	     O2HB_MAX_WRITE_TIMEOUT_MS);
 
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		spin_unlock(&o2hb_live_lock);
+	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
 	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -1173,6 +1213,7 @@ int o2hb_init(void)
 	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
 	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
 	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
 
 	return o2hb_debug_init();
 }
-- 
cgit v1.2.3


From a6de013654b4839c8609e26241ebd9eb1ecc52e6 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:13 -0700
Subject: ocfs2/cluster: Create debugfs files for live, quorum and failed
 region bitmaps

This patch prints the bitmaps of live, quorum and failed regions. This
information will be useful in debugging cluster issues.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f890656127fa..b06b9e52fba8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -76,6 +76,9 @@ static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 
 #define O2HB_DB_TYPE_LIVENODES		0
+#define O2HB_DB_TYPE_LIVEREGIONS	1
+#define O2HB_DB_TYPE_QUORUMREGIONS	2
+#define O2HB_DB_TYPE_FAILEDREGIONS	3
 struct o2hb_debug_buf {
 	int db_type;
 	int db_size;
@@ -84,12 +87,21 @@ struct o2hb_debug_buf {
 };
 
 static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
 
 #define O2HB_DEBUG_DIR			"o2hb"
 #define O2HB_DEBUG_LIVENODES		"livenodes"
+#define O2HB_DEBUG_LIVEREGIONS		"live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
 
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
 
 static LIST_HEAD(o2hb_all_regions);
 
@@ -1085,6 +1097,9 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
 
 	switch (db->db_type) {
 	case O2HB_DB_TYPE_LIVENODES:
+	case O2HB_DB_TYPE_LIVEREGIONS:
+	case O2HB_DB_TYPE_QUORUMREGIONS:
+	case O2HB_DB_TYPE_FAILEDREGIONS:
 		spin_lock(&o2hb_live_lock);
 		memcpy(map, db->db_data, db->db_size);
 		spin_unlock(&o2hb_live_lock);
@@ -1146,6 +1161,12 @@ static const struct file_operations o2hb_debug_fops = {
 void o2hb_exit(void)
 {
 	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
+	debugfs_remove(o2hb_debug_failedregions);
+	debugfs_remove(o2hb_debug_quorumregions);
+	debugfs_remove(o2hb_debug_liveregions);
 	debugfs_remove(o2hb_debug_livenodes);
 	debugfs_remove(o2hb_debug_dir);
 }
@@ -1189,6 +1210,48 @@ static int o2hb_debug_init(void)
 		mlog_errno(ret);
 		goto bail;
 	}
+
+	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+						   o2hb_debug_dir,
+						   &o2hb_db_liveregions,
+						   sizeof(*o2hb_db_liveregions),
+						   O2HB_DB_TYPE_LIVEREGIONS,
+						   sizeof(o2hb_live_region_bitmap),
+						   O2NM_MAX_REGIONS,
+						   o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_quorumregions =
+			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_quorumregions,
+					  sizeof(*o2hb_db_quorumregions),
+					  O2HB_DB_TYPE_QUORUMREGIONS,
+					  sizeof(o2hb_quorum_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_failedregions =
+			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_failedregions,
+					  sizeof(*o2hb_db_failedregions),
+					  O2HB_DB_TYPE_FAILEDREGIONS,
+					  sizeof(o2hb_failed_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
 	ret = 0;
 bail:
 	if (ret)
-- 
cgit v1.2.3


From 1f28530537f106f83e5cf7ef0193075667b6d520 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:12 -0700
Subject: ocfs2/cluster: Create debugfs dir/files for each region

This patch creates debugfs directory for each o2hb region and creates
files to expose the region number and the per region live node bitmap.
This information will be useful in debugging cluster issues.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 77 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b06b9e52fba8..f28de4b09c6b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -79,6 +79,8 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 #define O2HB_DB_TYPE_LIVEREGIONS	1
 #define O2HB_DB_TYPE_QUORUMREGIONS	2
 #define O2HB_DB_TYPE_FAILEDREGIONS	3
+#define O2HB_DB_TYPE_REGION_LIVENODES	4
+#define O2HB_DB_TYPE_REGION_NUMBER	5
 struct o2hb_debug_buf {
 	int db_type;
 	int db_size;
@@ -96,6 +98,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
 #define O2HB_DEBUG_LIVEREGIONS		"live_regions"
 #define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
 #define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER	"num"
 
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
@@ -203,6 +206,12 @@ struct o2hb_region {
 	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned int		hr_region_num;
 
+	struct dentry		*hr_debug_dir;
+	struct dentry		*hr_debug_livenodes;
+	struct dentry		*hr_debug_regnum;
+	struct o2hb_debug_buf	*hr_db_livenodes;
+	struct o2hb_debug_buf	*hr_db_regnum;
+
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state.  This will be fixed when we have
 	 * a more complete api that doesn't lead to this sort of fragility. */
@@ -1083,6 +1092,7 @@ static int o2hb_thread(void *data)
 static int o2hb_debug_open(struct inode *inode, struct file *file)
 {
 	struct o2hb_debug_buf *db = inode->i_private;
+	struct o2hb_region *reg;
 	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	char *buf = NULL;
 	int i = -1;
@@ -1105,6 +1115,19 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
 		spin_unlock(&o2hb_live_lock);
 		break;
 
+	case O2HB_DB_TYPE_REGION_LIVENODES:
+		spin_lock(&o2hb_live_lock);
+		reg = (struct o2hb_region *)db->db_data;
+		memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_NUMBER:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+				reg->hr_region_num);
+		goto done;
+
 	default:
 		goto done;
 	}
@@ -1342,6 +1365,12 @@ static void o2hb_region_release(struct config_item *item)
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
 
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_db_livenodes);
+	debugfs_remove(reg->hr_debug_livenodes);
+	debugfs_remove(reg->hr_debug_regnum);
+	debugfs_remove(reg->hr_debug_dir);
+
 	spin_lock(&o2hb_live_lock);
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
@@ -1856,10 +1885,52 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
 		: NULL;
 }
 
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_livenodes =
+			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_livenodes),
+					  sizeof(*(reg->hr_db_livenodes)),
+					  O2HB_DB_TYPE_REGION_LIVENODES,
+					  sizeof(reg->hr_live_node_bitmap),
+					  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_regnum =
+			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_regnum),
+					  sizeof(*(reg->hr_db_regnum)),
+					  O2HB_DB_TYPE_REGION_NUMBER,
+					  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	return ret;
+}
+
 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
 							  const char *name)
 {
 	struct o2hb_region *reg = NULL;
+	int ret;
 
 	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
 	if (reg == NULL)
@@ -1884,6 +1955,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
+	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+	if (ret) {
+		config_item_put(&reg->hr_item);
+		return ERR_PTR(ret);
+	}
+
 	return &reg->hr_item;
 }
 
-- 
cgit v1.2.3


From d6aa1c7c9e4b48081c2302e14b0f857017461efd Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 18:50:50 -0700
Subject: ocfs2/cluster: Add mlogs for heartbeat up/down events

This patch adds mlogs for o2hb up and down events.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f28de4b09c6b..e8676accf902 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -797,6 +797,8 @@ fire_callbacks:
 
 		/* first on the list generates a callback */
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+			     "bitmap\n", slot->ds_node_num);
 			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
 			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -845,6 +847,8 @@ fire_callbacks:
 		/* last off the live_slot generates a callback */
 		list_del_init(&slot->ds_live_item);
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+			     "nodes bitmap\n", slot->ds_node_num);
 			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
 			/* node can be null */
-- 
cgit v1.2.3


From 43695d095dfaf266a8a940d9b07eed7f46076b49 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 6 Oct 2010 17:55:09 -0700
Subject: ocfs2/cluster: Show per region heartbeat elapsed time

This patch adds a per region debugfs file that shows the elapsed time
since the time the o2hb timer was last armed.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index e8676accf902..29aee2128edb 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -81,6 +81,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
 #define O2HB_DB_TYPE_FAILEDREGIONS	3
 #define O2HB_DB_TYPE_REGION_LIVENODES	4
 #define O2HB_DB_TYPE_REGION_NUMBER	5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME	6
 struct o2hb_debug_buf {
 	int db_type;
 	int db_size;
@@ -99,6 +100,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
 #define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
 #define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
 #define O2HB_DEBUG_REGION_NUMBER	"num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME	"elapsed_time_in_ms"
 
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
@@ -209,8 +211,10 @@ struct o2hb_region {
 	struct dentry		*hr_debug_dir;
 	struct dentry		*hr_debug_livenodes;
 	struct dentry		*hr_debug_regnum;
+	struct dentry		*hr_debug_elapsed_time;
 	struct o2hb_debug_buf	*hr_db_livenodes;
 	struct o2hb_debug_buf	*hr_db_regnum;
+	struct o2hb_debug_buf	*hr_db_elapsed_time;
 
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state.  This will be fixed when we have
@@ -1132,6 +1136,13 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
 				reg->hr_region_num);
 		goto done;
 
+	case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+				jiffies_to_msecs(jiffies -
+						 reg->hr_last_timeout_start));
+		goto done;
+
 	default:
 		goto done;
 	}
@@ -1925,6 +1936,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
 		goto bail;
 	}
 
+	reg->hr_debug_elapsed_time =
+			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_elapsed_time),
+					  sizeof(*(reg->hr_db_elapsed_time)),
+					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+					  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
 	ret = 0;
 bail:
 	return ret;
-- 
cgit v1.2.3


From 9d002df492b14c690425d9785530371b6c1ccbca Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Oct 2010 19:51:11 -0400
Subject: cifs: add routines to build sessions and tcons on the fly

This patch is rather large, but it's a bit difficult to do piecemeal...

For non-multiuser mounts, everything will basically work as it does
today. A call to cifs_sb_tlink will return the "master" tcon link.

Turn the tcon pointer in the cifs_sb into a radix tree that uses the
fsuid of the process as a key. The value is a new "tcon_link" struct
that contains info about a tcon that's under construction.

When a new process needs a tcon, it'll call cifs_sb_tcon. That will
then look up the tcon_link in the radix tree. If it exists and is
valid, it's returned.

If it doesn't exist, then we stuff a new tcon_link into the tree and
mark it as pending and then go and try to build the session/tcon.
If that works, the tcon pointer in the tcon_link is updated and the
pending flag is cleared.

If the construction fails, then we set the tcon pointer to an ERR_PTR
and clear the pending flag.

If the radix tree is searched and the tcon_link is marked pending
then we go to sleep and wait for the pending flag to be cleared.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |   7 +-
 fs/cifs/cifsglob.h   |  32 +++---
 fs/cifs/connect.c    | 268 +++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 279 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index e04e6923d354..5ce57bdf1865 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
+#include <linux/radix-tree.h>
+
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
 
@@ -40,8 +42,9 @@
 #define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 
 struct cifs_sb_info {
-	struct cifsTconInfo *ptcon;	/* primary mount */
-	struct list_head nested_tcon_q;
+	struct radix_tree_root tlink_tree;
+#define CIFS_TLINK_MASTER_TAG		0	/* is "master" (mount) tcon */
+	spinlock_t tlink_tree_lock;
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d5324853203b..9a7c472a153f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -317,42 +317,36 @@ struct cifsTconInfo {
  * "get" on the container.
  */
 struct tcon_link {
-	spinlock_t		tl_lock;
-	u32			tl_count;
-	u64			tl_time;
+	unsigned long		tl_index;
+	unsigned long		tl_flags;
+#define TCON_LINK_MASTER	0
+#define TCON_LINK_PENDING	1
+#define TCON_LINK_IN_TREE	2
+	unsigned long		tl_time;
+	atomic_t		tl_count;
 	struct cifsTconInfo	*tl_tcon;
 };
 
-static inline struct tcon_link *
-cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
-{
-	return (struct tcon_link *)cifs_sb->ptcon;
-}
+extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
 
 static inline struct cifsTconInfo *
 tlink_tcon(struct tcon_link *tlink)
 {
-	return (struct cifsTconInfo *)tlink;
+	return tlink->tl_tcon;
 }
 
-static inline void
-cifs_put_tlink(struct tcon_link *tlink)
-{
-	return;
-}
+extern void cifs_put_tlink(struct tcon_link *tlink);
 
 static inline struct tcon_link *
 cifs_get_tlink(struct tcon_link *tlink)
 {
+	if (tlink && !IS_ERR(tlink))
+		atomic_inc(&tlink->tl_count);
 	return tlink;
 }
 
 /* This function is always expected to succeed */
-static inline struct cifsTconInfo *
-cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
-{
-	return cifs_sb->ptcon;
-}
+extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
 
 /*
  * This info hangs off the cifsFileInfo structure, pointed to by llist.
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f6a3091c2874..3156a9de947d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,6 +109,9 @@ struct smb_vol {
 	struct nls_table *local_nls;
 };
 
+#define TLINK_ERROR_EXPIRE	(1 * HZ)
+
+
 static int ipv4_connect(struct TCP_Server_Info *server);
 static int ipv6_connect(struct TCP_Server_Info *server);
 
@@ -1959,6 +1962,23 @@ out_fail:
 	return ERR_PTR(rc);
 }
 
+void
+cifs_put_tlink(struct tcon_link *tlink)
+{
+	if (!tlink || IS_ERR(tlink))
+		return;
+
+	if (!atomic_dec_and_test(&tlink->tl_count) ||
+	    test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
+		tlink->tl_time = jiffies;
+		return;
+	}
+
+	if (!IS_ERR(tlink_tcon(tlink)))
+		cifs_put_tcon(tlink_tcon(tlink));
+	kfree(tlink);
+	return;
+}
 
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -2641,6 +2661,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct TCP_Server_Info *srvTcp;
 	char   *full_path;
 	char *mount_data = mount_data_global;
+	struct tcon_link *tlink;
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	struct dfs_info3_param *referrals = NULL;
 	unsigned int num_referrals = 0;
@@ -2652,6 +2673,7 @@ try_mount_again:
 	pSesInfo = NULL;
 	srvTcp = NULL;
 	full_path = NULL;
+	tlink = NULL;
 
 	xid = GetXid();
 
@@ -2727,8 +2749,6 @@ try_mount_again:
 		goto remote_path_check;
 	}
 
-	cifs_sb->ptcon = tcon;
-
 	/* do not care if following two calls succeed - informational */
 	if (!tcon->ipc) {
 		CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2837,6 +2857,35 @@ remote_path_check:
 #endif
 	}
 
+	if (rc)
+		goto mount_fail_check;
+
+	/* now, hang the tcon off of the superblock */
+	tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
+	if (tlink == NULL) {
+		rc = -ENOMEM;
+		goto mount_fail_check;
+	}
+
+	tlink->tl_index = pSesInfo->linux_uid;
+	tlink->tl_tcon = tcon;
+	tlink->tl_time = jiffies;
+	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+	rc = radix_tree_preload(GFP_KERNEL);
+	if (rc == -ENOMEM) {
+		kfree(tlink);
+		goto mount_fail_check;
+	}
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
+	radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
+			   CIFS_TLINK_MASTER_TAG);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+	radix_tree_preload_end();
+
 mount_fail_check:
 	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
@@ -3023,19 +3072,37 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
-	int rc = 0;
+	int i, ret;
 	char *tmp;
-	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct tcon_link *tlink[8];
+	unsigned long index = 0;
+
+	do {
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+					     (void **)tlink, index,
+					     ARRAY_SIZE(tlink));
+		/* increment index for next pass */
+		if (ret > 0)
+			index = tlink[ret - 1]->tl_index + 1;
+		for (i = 0; i < ret; i++) {
+			cifs_get_tlink(tlink[i]);
+			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
+			radix_tree_delete(&cifs_sb->tlink_tree,
+							tlink[i]->tl_index);
+		}
+		spin_unlock(&cifs_sb->tlink_tree_lock);
 
-	cifs_put_tcon(tcon);
+		for (i = 0; i < ret; i++)
+			cifs_put_tlink(tlink[i]);
+	} while (ret != 0);
 
-	cifs_sb->ptcon = NULL;
 	tmp = cifs_sb->prepath;
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
 
-	return rc;
+	return 0;
 }
 
 int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -3096,3 +3163,190 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	return rc;
 }
 
+struct cifsTconInfo *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+{
+	struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
+	struct cifsSesInfo *ses;
+	struct cifsTconInfo *tcon = NULL;
+	struct smb_vol *vol_info;
+	char username[MAX_USERNAME_SIZE + 1];
+
+	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
+	if (vol_info == NULL) {
+		tcon = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
+	vol_info->username = username;
+	vol_info->local_nls = cifs_sb->local_nls;
+	vol_info->linux_uid = fsuid;
+	vol_info->cred_uid = fsuid;
+	vol_info->UNC = master_tcon->treeName;
+	vol_info->retry = master_tcon->retry;
+	vol_info->nocase = master_tcon->nocase;
+	vol_info->local_lease = master_tcon->local_lease;
+	vol_info->no_linux_ext = !master_tcon->unix_ext;
+
+	/* FIXME: allow for other secFlg settings */
+	vol_info->secFlg = CIFSSEC_MUST_KRB5;
+
+	/* get a reference for the same TCP session */
+	write_lock(&cifs_tcp_ses_lock);
+	++master_tcon->ses->server->srv_count;
+	write_unlock(&cifs_tcp_ses_lock);
+
+	ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
+	if (IS_ERR(ses)) {
+		tcon = (struct cifsTconInfo *)ses;
+		cifs_put_tcp_session(master_tcon->ses->server);
+		goto out;
+	}
+
+	tcon = cifs_get_tcon(ses, vol_info);
+	if (IS_ERR(tcon)) {
+		cifs_put_smb_ses(ses);
+		goto out;
+	}
+
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+out:
+	kfree(vol_info);
+
+	return tcon;
+}
+
+static struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+	struct tcon_link *tlink;
+	unsigned int ret;
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
+					0, 1, CIFS_TLINK_MASTER_TAG);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	/* the master tcon should always be present */
+	if (ret == 0)
+		BUG();
+
+	return tlink;
+}
+
+struct cifsTconInfo *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
+}
+
+static int
+cifs_sb_tcon_pending_wait(void *unused)
+{
+	schedule();
+	return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
+ * current task.
+ *
+ * If the superblock doesn't refer to a multiuser mount, then just return
+ * the master tcon for the mount.
+ *
+ * First, search the radix tree for an existing tcon for this fsuid. If one
+ * exists, then check to see if it's pending construction. If it is then wait
+ * for construction to complete. Once it's no longer pending, check to see if
+ * it failed and either return an error or retry construction, depending on
+ * the timeout.
+ *
+ * If one doesn't exist then insert a new tcon_link struct into the tree and
+ * try to construct a new one.
+ */
+struct tcon_link *
+cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
+{
+	int ret;
+	unsigned long fsuid = (unsigned long) current_fsuid();
+	struct tcon_link *tlink, *newtlink;
+
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+	if (tlink)
+		cifs_get_tlink(tlink);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	if (tlink == NULL) {
+		newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+		if (newtlink == NULL)
+			return ERR_PTR(-ENOMEM);
+		newtlink->tl_index = fsuid;
+		newtlink->tl_tcon = ERR_PTR(-EACCES);
+		set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
+		set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
+		cifs_get_tlink(newtlink);
+
+		ret = radix_tree_preload(GFP_KERNEL);
+		if (ret != 0) {
+			kfree(newtlink);
+			return ERR_PTR(ret);
+		}
+
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		/* was one inserted after previous search? */
+		tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+		if (tlink) {
+			cifs_get_tlink(tlink);
+			spin_unlock(&cifs_sb->tlink_tree_lock);
+			radix_tree_preload_end();
+			kfree(newtlink);
+			goto wait_for_construction;
+		}
+		ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		radix_tree_preload_end();
+		if (ret) {
+			kfree(newtlink);
+			return ERR_PTR(ret);
+		}
+		tlink = newtlink;
+	} else {
+wait_for_construction:
+		ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
+				  cifs_sb_tcon_pending_wait,
+				  TASK_INTERRUPTIBLE);
+		if (ret) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(ret);
+		}
+
+		/* if it's good, return it */
+		if (!IS_ERR(tlink->tl_tcon))
+			return tlink;
+
+		/* return error if we tried this already recently */
+		if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(-EACCES);
+		}
+
+		if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
+			goto wait_for_construction;
+	}
+
+	tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
+	clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
+	wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
+
+	if (IS_ERR(tlink->tl_tcon)) {
+		cifs_put_tlink(tlink);
+		return ERR_PTR(-EACCES);
+	}
+
+	return tlink;
+}
-- 
cgit v1.2.3


From 0eb8a132c449c755b7a3f18f33365b2040c47347 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Oct 2010 19:51:12 -0400
Subject: cifs: add "multiuser" mount option

This allows someone to declare a mount as a multiuser mount.

Multiuser mounts also imply "noperm" since we want to allow the server
to handle permission checking. It also (for now) requires Kerberos
authentication. Eventually, we could expand this to other authtypes, but
that requires a scheme to allow per-user credential stashing in some
form.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3156a9de947d..e65f72d1f23b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -100,6 +100,7 @@ struct smb_vol {
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	bool fsc:1;	/* enable fscache */
 	bool mfsymlinks:1; /* use Minshall+French Symlinks */
+	bool multiuser:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
@@ -1347,6 +1348,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->fsc = true;
 		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
 			vol->mfsymlinks = true;
+		} else if (strnicmp(data, "multiuser", 8) == 0) {
+			vol->multiuser = true;
 		} else
 			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
 						data);
@@ -1378,6 +1381,13 @@ cifs_parse_mount_options(char *options, const char *devname,
 			return 1;
 		}
 	}
+
+	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
+		cERROR(1, "Multiuser mounts currently require krb5 "
+			  "authentication!");
+		return 1;
+	}
+
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
 
@@ -2563,6 +2573,9 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
 	if (pvolume_info->fsc)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
+	if (pvolume_info->multiuser)
+		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
+					    CIFS_MOUNT_NO_PERM);
 	if (pvolume_info->direct_io) {
 		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-- 
cgit v1.2.3


From 13cd4b7f7472eea7cbc1ab34e042842fbb902160 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 7 Oct 2010 18:46:32 +0000
Subject: [CIFS] Various small checkpatch cleanups

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.h |  2 +-
 fs/cifs/cifsfs.h     |  2 +-
 fs/cifs/cifsglob.h   | 20 ++++++++++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
 extern int traceSMB;		/* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
 #define CIFS_INFO	0x01
-#define CIFS_RC  	0x02
+#define CIFS_RC		0x02
 #define CIFS_TIMER	0x04
 
 /*
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..786bdf36aebf 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -104,7 +104,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
 			const char *symname);
 extern int	cifs_removexattr(struct dentry *, const char *);
-extern int 	cifs_setxattr(struct dentry *, const char *, const void *,
+extern int	cifs_setxattr(struct dentry *, const char *, const void *,
 			size_t, int);
 extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9a7c472a153f..4f85dfdf197d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -517,16 +517,16 @@ struct oplock_q_entry {
 
 /* for pending dnotify requests */
 struct dir_notify_req {
-       struct list_head lhead;
-       __le16 Pid;
-       __le16 PidHigh;
-       __u16 Mid;
-       __u16 Tid;
-       __u16 Uid;
-       __u16 netfid;
-       __u32 filter; /* CompletionFilter (for multishot) */
-       int multishot;
-       struct file *pfile;
+	struct list_head lhead;
+	__le16 Pid;
+	__le16 PidHigh;
+	__u16 Mid;
+	__u16 Tid;
+	__u16 Uid;
+	__u16 netfid;
+	__u32 filter; /* CompletionFilter (for multishot) */
+	int multishot;
+	struct file *pfile;
 };
 
 struct dfs_info3_param {
-- 
cgit v1.2.3


From 955a857e062642cd3ebe1dc7bb38c0f85d8f8f17 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 29 Sep 2010 15:41:49 -0400
Subject: NFS: new idmapper

This patch creates a new idmapper system that uses the request-key function to
place a call into userspace to map user and group ids to names.  The old
idmapper was single threaded, which prevented more than one request from running
at a single time.  This means that a user would have to wait for an upcall to
finish before accessing a cached result.

The upcall result is stored on a keyring of type id_resolver.  See the file
Documentation/filesystems/nfs/idmapper.txt for instructions.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
[Trond: fix up the return value of nfs_idmap_lookup_name and clean up code]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/nfs/00-INDEX     |   2 +
 Documentation/filesystems/nfs/idmapper.txt |  67 +++++++++
 fs/nfs/Kconfig                             |  11 ++
 fs/nfs/idmap.c                             | 211 ++++++++++++++++++++++++++++-
 fs/nfs/inode.c                             |   7 +
 fs/nfs/nfs4xdr.c                           |   4 +-
 fs/nfs/sysctl.c                            |   2 +
 include/linux/nfs_idmap.h                  |  31 ++++-
 8 files changed, 329 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/filesystems/nfs/idmapper.txt

(limited to 'fs')

diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 2f68cd688769..3225a5662114 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -14,3 +14,5 @@ nfsroot.txt
 	- short guide on setting up a diskless box with NFS root filesystem.
 rpc-cache.txt
 	- introduction to the caching mechanisms in the sunrpc layer.
+idmapper.txt
+	- information for configuring request-keys to be used by idmapper
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt
new file mode 100644
index 000000000000..c3852041a21f
--- /dev/null
+++ b/Documentation/filesystems/nfs/idmapper.txt
@@ -0,0 +1,67 @@
+
+=========
+ID Mapper
+=========
+Id mapper is used by NFS to translate user and group ids into names, and to
+translate user and group names into ids.  Part of this translation involves
+performing an upcall to userspace to request the information.  Id mapper will
+user request-key to perform this upcall and cache the result.  The program
+/usr/sbin/nfs.upcall should be called by request-key, and will perform the
+translation and initialize a key with the resulting information.
+
+ NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this
+ feature.
+
+===========
+Configuring
+===========
+The file /etc/request-key.conf will need to be modified so /sbin/request-key can
+direct the upcall.  The following line should be added:
+
+#OP	TYPE	DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ARG3 ...
+#======	=======	===============	===============	===============================
+create	id_resolver	*	*		/usr/sbin/nfs.upcall %k %d 600
+
+This will direct all id_resolver requests to the program /usr/sbin/nfs.upcall.
+The last parameter, 600, defines how many seconds into the future the key will
+expire.  This parameter is optional for /usr/sbin/nfs.upcall.  When the timeout
+is not specified, nfs.upcall will default to 600 seconds.
+
+id mapper uses for key descriptions:
+	  uid:  Find the UID for the given user
+	  gid:  Find the GID for the given group
+	 user:  Find the user  name for the given UID
+	group:  Find the group name for the given GID
+
+You can handle any of these individually, rather than using the generic upcall
+program.  If you would like to use your own program for a uid lookup then you
+would edit your request-key.conf so it look similar to this:
+
+#OP	TYPE	DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ARG3 ...
+#======	=======	===============	===============	===============================
+create	id_resolver	uid:*	*		/some/other/program  %k %d 600
+create	id_resolver	*	*		/usr/sbin/nfs.upcall %k %d 600
+
+Notice that the new line was added above the line for the generic program.
+request-key will find the first matching line and corresponding program.  In
+this case, /some/other/program will handle all uid lookups and
+/usr/sbin/nfs.upcall will handle gid, user, and group lookups.
+
+See <file:Documentation/keys-request-keys.txt> for more information about the
+request-key function.
+
+
+==========
+nfs.upcall
+==========
+nfs.upcall is designed to be called by request-key, and should not be run "by
+hand".  This program takes two arguments, a serialized key and a key
+description.  The serialized key is first converted into a key_serial_t, and
+then passed as an argument to keyctl_instantiate (both are part of keyutils.h).
+
+The actual lookups are performed by functions found in nfsidmap.h.  nfs.upcall
+determines the correct function to call by looking at the first part of the
+description string.  For example, a uid lookup description will appear as
+"uid:user@domain".
+
+nfs.upcall will return 0 if the key was instantiated, and non-zero otherwise.
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 6c2aad49d731..3f69752d6f18 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -116,3 +116,14 @@ config NFS_USE_KERNEL_DNS
 	select DNS_RESOLVER
 	select KEYS
 	default y
+
+config NFS_USE_NEW_IDMAPPER
+	bool "Use the new idmapper upcall routine"
+	depends on NFS_V4 && KEYS
+	help
+	  Say Y here if you want NFS to use the new idmapper upcall functions.
+	  You will need /sbin/request-key (usually provided by the keyutils
+	  package).  For details, read
+	  <file:Documentation/filesystems/nfs/idmapper.txt>.
+
+	  If you are unsure, say N.
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..dec47ed8b6b9 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
+
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/nfs_idmap.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+
+#include <keys/user-type.h>
+
+#define NFS_UINT_MAXLEN 11
+
+const struct cred *id_resolver_cache;
+
+struct key_type key_type_id_resolver = {
+	.name		= "id_resolver",
+	.instantiate	= user_instantiate,
+	.match		= user_match,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= user_describe,
+	.read		= user_read,
+};
+
+int nfs_idmap_init(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret = 0;
+
+	printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
+			     (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			     KEY_USR_VIEW | KEY_USR_READ,
+			     KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_id_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	id_resolver_cache = cred;
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+void nfs_idmap_quit(void)
+{
+	key_revoke(id_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_id_resolver);
+	put_cred(id_resolver_cache);
+}
+
+/*
+ * Assemble the description to pass to request_key()
+ * This function will allocate a new string and update dest to point
+ * at it.  The caller is responsible for freeing dest.
+ *
+ * On error 0 is returned.  Otherwise, the length of dest is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+				const char *type, size_t typelen, char **desc)
+{
+	char *cp;
+	size_t desclen = typelen + namelen + 2;
+
+	*desc = kmalloc(desclen, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	cp = *desc;
+	memcpy(cp, type, typelen);
+	cp += typelen;
+	*cp++ = ':';
+
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+	return desclen;
+}
+
+static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
+		const char *type, void *data, size_t data_size)
+{
+	const struct cred *saved_cred;
+	struct key *rkey;
+	char *desc;
+	struct user_key_payload *payload;
+	ssize_t ret;
+
+	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+	if (ret <= 0)
+		goto out;
+
+	saved_cred = override_creds(id_resolver_cache);
+	rkey = request_key(&key_type_id_resolver, desc, "");
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto out_up;
+
+	payload = rcu_dereference(rkey->payload.data);
+	if (IS_ERR_OR_NULL(payload)) {
+		ret = PTR_ERR(payload);
+		goto out_up;
+	}
+
+	ret = payload->datalen;
+	if (ret > 0 && ret <= data_size)
+		memcpy(data, payload->data, ret);
+	else
+		ret = -EINVAL;
+
+out_up:
+	rcu_read_unlock();
+	key_put(rkey);
+out:
+	return ret;
+}
+
+
+/* ID -> Name */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	int id_len;
+	ssize_t ret;
+
+	id_len = snprintf(id_str, sizeof(id_str), "%u", id);
+	ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
+	if (ret < 0)
+		return -EINVAL;
+	return ret;
+}
+
+/* Name -> ID */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen,
+				const char *type, __u32 *id)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	long id_long;
+	ssize_t data_size;
+	int ret = 0;
+
+	data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+	if (data_size <= 0) {
+		ret = -EINVAL;
+	} else {
+		ret = strict_strtol(id_str, 10, &id_long);
+		*id = (__u32)id_long;
+	}
+	return ret;
+}
+
+int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+{
+	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
+}
+
+int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+{
+	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
+}
+
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+{
+	return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+}
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+{
+	return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+}
+
+#else  /* CONFIG_NFS_USE_IDMAPPER not defined */
+
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
 }
 
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 18be041abd23..f2d2c801e0af 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1526,6 +1526,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfs_idmap_init();
+	if (err < 0)
+		goto out9;
+
 	err = nfs_dns_resolver_init();
 	if (err < 0)
 		goto out8;
@@ -1590,6 +1594,8 @@ out6:
 out7:
 	nfs_dns_resolver_destroy();
 out8:
+	nfs_idmap_quit();
+out9:
 	return err;
 }
 
@@ -1602,6 +1608,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
 	nfs_dns_resolver_destroy();
+	nfs_idmap_quit();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3feace66b981..6ea5c9392fe4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -816,7 +816,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 					iap->ia_uid);
@@ -828,7 +828,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 					iap->ia_gid);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
 	{
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
 	{
 		.procname	= "nfs_mountpoint_timeout",
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index 91a1c24e0cbf..e8352dc5afb5 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -66,13 +66,40 @@ struct idmap_msg {
 /* Forward declaration to make this header independent of others */
 struct nfs_client;
 
+#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
+
+int nfs_idmap_init(void);
+void nfs_idmap_quit(void);
+
+static inline int nfs_idmap_new(struct nfs_client *clp)
+{
+	return 0;
+}
+
+static inline void nfs_idmap_delete(struct nfs_client *clp)
+{
+}
+
+#else /* CONFIG_NFS_USE_NEW_IDMAPPER not set */
+
+static inline int nfs_idmap_init(void)
+{
+	return 0;
+}
+
+static inline void nfs_idmap_quit(void)
+{
+}
+
 int nfs_idmap_new(struct nfs_client *);
 void nfs_idmap_delete(struct nfs_client *);
 
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
+
 int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *);
 int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *);
-int nfs_map_uid_to_name(struct nfs_client *, __u32, char *);
-int nfs_map_gid_to_group(struct nfs_client *, __u32, char *);
+int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t);
+int nfs_map_gid_to_group(struct nfs_client *, __u32, char *, size_t);
 
 extern unsigned int nfs_idmap_cache_timeout;
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 3aa1c8c2900065a51268430ab48a1b42fdfe5b45 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 7 Oct 2010 14:46:28 -0400
Subject: cifs: on multiuser mount, set ownership to
 current_fsuid/current_fsgid (try #5)

...when unix extensions aren't enabled. This makes everything on the
mount appear to be owned by the current user.

This version of the patch differs from previous versions however in that
the admin can still force the ownership of all files to appear as a
single user via the uid=/gid= options.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index df29a3a3d80c..7e5b8c39a07a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1755,11 +1755,21 @@ check_inval:
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct kstat *stat)
 {
+	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	int err = cifs_revalidate_dentry(dentry);
+
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
 		stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+		    !tcon->unix_ext) {
+			if (!cifs_sb->mnt_uid)
+				stat->uid = current_fsuid();
+			if (!cifs_sb->mnt_uid)
+				stat->gid = current_fsgid();
+		}
 	}
 	return err;
 }
-- 
cgit v1.2.3


From 2de970ff69bbcc5a4b7440df669a595b2b1acd73 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Oct 2010 19:51:12 -0400
Subject: cifs: implement recurring workqueue job to prune old tcons

Create a workqueue job that cleans out unused tlinks. For now, it uses
a hardcoded expire time of 10 minutes. When it's done, the work rearms
itself. On umount, the work is cancelled before tearing down the tlink
tree.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  1 +
 fs/cifs/connect.c    | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 5ce57bdf1865..586ee3d527d2 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -59,5 +59,6 @@ struct cifs_sb_info {
 	char   *mountdata; /* mount options received at mount time */
 #endif
 	struct backing_dev_info bdi;
+	struct delayed_work prune_tlinks;
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e65f72d1f23b..1092e9e839c2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -110,11 +110,13 @@ struct smb_vol {
 	struct nls_table *local_nls;
 };
 
+/* FIXME: should these be tunable? */
 #define TLINK_ERROR_EXPIRE	(1 * HZ)
-
+#define TLINK_IDLE_EXPIRE	(600 * HZ)
 
 static int ipv4_connect(struct TCP_Server_Info *server);
 static int ipv6_connect(struct TCP_Server_Info *server);
+static void cifs_prune_tlinks(struct work_struct *work);
 
 /*
  * cifs tcp session reconnection
@@ -2494,6 +2496,8 @@ convert_delimiter(char *path, char delim)
 static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
+	INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+
 	if (pvolume_info->rsize > CIFSMaxBufSize) {
 		cERROR(1, "rsize %d too large, using MaxBufSize",
 			pvolume_info->rsize);
@@ -2899,6 +2903,9 @@ remote_path_check:
 	spin_unlock(&cifs_sb->tlink_tree_lock);
 	radix_tree_preload_end();
 
+	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+
 mount_fail_check:
 	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
@@ -3090,6 +3097,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	struct tcon_link *tlink[8];
 	unsigned long index = 0;
 
+	cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+
 	do {
 		spin_lock(&cifs_sb->tlink_tree_lock);
 		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
@@ -3363,3 +3372,50 @@ wait_for_construction:
 
 	return tlink;
 }
+
+/*
+ * periodic workqueue job that scans tcon_tree for a superblock and closes
+ * out tcons.
+ */
+static void
+cifs_prune_tlinks(struct work_struct *work)
+{
+	struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
+						    prune_tlinks.work);
+	struct tcon_link *tlink[8];
+	unsigned long now = jiffies;
+	unsigned long index = 0;
+	int i, ret;
+
+	do {
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+					     (void **)tlink, index,
+					     ARRAY_SIZE(tlink));
+		/* increment index for next pass */
+		if (ret > 0)
+			index = tlink[ret - 1]->tl_index + 1;
+		for (i = 0; i < ret; i++) {
+			if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
+			    atomic_read(&tlink[i]->tl_count) != 0 ||
+			    time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
+				       now)) {
+				tlink[i] = NULL;
+				continue;
+			}
+			cifs_get_tlink(tlink[i]);
+			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
+			radix_tree_delete(&cifs_sb->tlink_tree,
+					  tlink[i]->tl_index);
+		}
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+
+		for (i = 0; i < ret; i++) {
+			if (tlink[i] != NULL)
+				cifs_put_tlink(tlink[i]);
+		}
+	} while (ret != 0);
+
+	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+}
-- 
cgit v1.2.3


From ccc46a7402200a3b28a8fa1605ea5405a9ef66f7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 6 Oct 2010 19:51:12 -0400
Subject: cifs: fix module refcount leak in find_domain_name

find_domain_name() uses load_nls_default which takes a module reference
on the appropriate NLS module, but doesn't put it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 730038a12982..89fb94fac4b5 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -328,13 +328,16 @@ find_domain_name(struct cifsSesInfo *ses)
 			if (!attrsize)
 				break;
 			if (!ses->domainName) {
+				struct nls_table *default_nls;
 				ses->domainName =
 					kmalloc(attrsize + 1, GFP_KERNEL);
 				if (!ses->domainName)
 						return -ENOMEM;
+				default_nls = load_nls_default();
 				cifs_from_ucs2(ses->domainName,
 					(__le16 *)blobptr, attrsize, attrsize,
-					load_nls_default(), false);
+					default_nls, false);
+				unload_nls(default_nls);
 				break;
 			}
 		}
-- 
cgit v1.2.3


From d2445556137c38ae15d3191174bfd235630ed7cd Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 8 Oct 2010 03:38:46 +0000
Subject: [CIFS] Remove build warning

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1092e9e839c2..4944fc84d5ef 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3185,7 +3185,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	return rc;
 }
 
-struct cifsTconInfo *
+static struct cifsTconInfo *
 cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 {
 	struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
-- 
cgit v1.2.3


From b8aeec34175fc8fe8b0d40efea4846dfc1ba663e Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 7 Oct 2010 15:31:31 +0900
Subject: HWPOISON/signalfd: add support for addr_lsb

Similar change as to signal delivery: copy out the si_addr_lsb field
to user space in signalfd

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/signalfd.c            | 10 ++++++++++
 include/linux/signalfd.h |  3 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..bdd4496ae67f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+		/* 
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
 #endif
 		break;
 	case __SI_CHLD:
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index b363b916c909..3ff4961da9b5 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -33,6 +33,7 @@ struct signalfd_siginfo {
 	__u64 ssi_utime;
 	__u64 ssi_stime;
 	__u64 ssi_addr;
+	__u16 ssi_addr_lsb;
 
 	/*
 	 * Pad strcture to 128 bytes. Remember to update the
@@ -43,7 +44,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[48];
+	__u8 __pad[46];
 };
 
 
-- 
cgit v1.2.3


From 290408d4a25002f099efeee7b6a5778d431154d6 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 8 Sep 2010 10:19:35 +0900
Subject: hugetlb: hugepage migration core

This patch extends page migration code to support hugepage migration.
One of the potential users of this feature is soft offlining which
is triggered by memory corrected errors (added by the next patch.)

Todo:
- there are other users of page migration such as memory policy,
  memory hotplug and memocy compaction.
  They are not ready for hugepage support for now.

ChangeLog since v4:
- define migrate_huge_pages()
- remove changes on isolation/putback_lru_page()

ChangeLog since v2:
- refactor isolate/putback_lru_page() to handle hugepage
- add comment about race on unmap_and_move_huge_page()

ChangeLog since v1:
- divide migration code path for hugepage
- define routine checking migration swap entry for hugetlb
- replace "goto" with "if/else" in remove_migration_pte()

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/hugetlbfs/inode.c    |  15 ++++
 include/linux/migrate.h |  16 ++++
 mm/hugetlb.c            |  18 +++-
 mm/migrate.c            | 232 ++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 262 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..1f7ca505d48e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
 };
 
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd4..3c1941e40e61 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+			unsigned long private, int offlining);
 
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+		unsigned long private, int offlining) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
 	return -ENOSYS;
 }
 
+static inline void migrate_page_copy(struct page *newpage,
+				     struct page *page) {}
+
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a73dbdcb89eb..0fa9de8361bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2217,6 +2217,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2648,7 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..55dbc45880c6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
  	spinlock_t *ptl;
 
- 	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;
 
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;
 
-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);
 
-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
- 	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}
 
- 	ptl = pte_lockptr(mm, pmd);
  	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
@@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);
 
-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -275,12 +291,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	return 0;
 }
 
+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+	    (struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
 
 	if (PageError(page))
 		SetPageError(newpage);
@@ -723,6 +787,92 @@ move_newpage:
 	return rc;
 }
 
+/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+					    &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
 /*
  * migrate_pages
  *
@@ -788,6 +938,52 @@ out:
 	return nr_failed + retry;
 }
 
+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
-- 
cgit v1.2.3


From e4eab08d6050ad04d960738f589724204fd1064c Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@fluxnic.net>
Date: Fri, 20 Aug 2010 21:14:46 +0100
Subject: ARM: 6342/1: fix ASLR of PIE executables

Since commits 990cb8acf2 and cc92c28b2d, it is possible to have full
address space layout randomization (ASLR) on ARM.  Except that one small
change was missing for ASLR of PIE executables.

Signed-off-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 fs/binfmt_elf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..6884e198e0c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_ARM)
 			load_bias = 0;
 #else
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-- 
cgit v1.2.3


From 0dd12c21951e085357046b68c3a108273062d2aa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 8 Oct 2010 12:20:12 -0400
Subject: cifs: initialize tlink_tree_lock and tlink_tree

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c96345c3314d..3258c822328b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -98,6 +98,9 @@ cifs_read_super(struct super_block *sb, void *data,
 	if (cifs_sb == NULL)
 		return -ENOMEM;
 
+	spin_lock_init(&cifs_sb->tlink_tree_lock);
+	INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+
 	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
 	if (rc) {
 		kfree(cifs_sb);
-- 
cgit v1.2.3


From 4d94aa1b1d437f9513ddc89974d8bd214b8304f6 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Sat, 9 Oct 2010 10:27:04 -0700
Subject: ocfs2/cluster: Bump up dlm protocol to version 1.1

dlm protocol 1.1. activates messages DLM_QUERY_REGION and DLM_QUERY_NODEINFO
that are a must for global heartbeat.

It also activates o2hb_global_heartbeat_active().

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 fs/ocfs2/dlm/dlmdomain.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 29aee2128edb..6a1280a013ea 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2429,6 +2429,6 @@ EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
 
 int o2hb_global_heartbeat_active(void)
 {
-	return 0;
+	return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
 }
 EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 78d428f5e10e..58a93b953735 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -135,7 +135,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  */
 static const struct dlm_protocol_version dlm_protocol = {
 	.pv_major = 1,
-	.pv_minor = 0,
+	.pv_minor = 1,
 };
 
 #define DLM_DOMAIN_BACKOFF_MS 200
-- 
cgit v1.2.3


From 6370a6ad3b53df90b4700977f7718118a2cd524a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 11 Oct 2010 15:12:27 +0200
Subject: workqueue: add and use WQ_MEM_RECLAIM flag

Add WQ_MEM_RECLAIM flag which currently maps to WQ_RESCUER, mark
WQ_RESCUER as internal and replace all external WQ_RESCUER usages to
WQ_MEM_RECLAIM.

This makes the API users express the intent of the workqueue instead
of indicating the internal mechanism used to guarantee forward
progress.  This is also to make it cleaner to add more semantics to
WQ_MEM_RECLAIM.  For example, if deemed necessary, memory reclaim
workqueues can be made highpri.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
---
 Documentation/workqueue.txt | 29 +++++++++++++++--------------
 drivers/ata/libata-sff.c    |  2 +-
 fs/gfs2/main.c              |  2 +-
 fs/xfs/linux-2.6/xfs_buf.c  |  2 +-
 include/linux/workqueue.h   | 11 ++++++-----
 kernel/workqueue.c          |  7 +++++++
 6 files changed, 31 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt
index e4498a2872c3..996a27d9b8db 100644
--- a/Documentation/workqueue.txt
+++ b/Documentation/workqueue.txt
@@ -196,11 +196,11 @@ resources, scheduled and executed.
 	suspend operations.  Work items on the wq are drained and no
 	new work item starts execution until thawed.
 
-  WQ_RESCUER
+  WQ_MEM_RECLAIM
 
 	All wq which might be used in the memory reclaim paths _MUST_
-	have this flag set.  This reserves one worker exclusively for
-	the execution of this wq under memory pressure.
+	have this flag set.  The wq is guaranteed to have at least one
+	execution context regardless of memory pressure.
 
   WQ_HIGHPRI
 
@@ -356,11 +356,11 @@ If q1 has WQ_CPU_INTENSIVE set,
 
 6. Guidelines
 
-* Do not forget to use WQ_RESCUER if a wq may process work items which
-  are used during memory reclaim.  Each wq with WQ_RESCUER set has one
-  rescuer thread reserved for it.  If there is dependency among
-  multiple work items used during memory reclaim, they should be
-  queued to separate wq each with WQ_RESCUER.
+* Do not forget to use WQ_MEM_RECLAIM if a wq may process work items
+  which are used during memory reclaim.  Each wq with WQ_MEM_RECLAIM
+  set has an execution context reserved for it.  If there is
+  dependency among multiple work items used during memory reclaim,
+  they should be queued to separate wq each with WQ_MEM_RECLAIM.
 
 * Unless strict ordering is required, there is no need to use ST wq.
 
@@ -368,12 +368,13 @@ If q1 has WQ_CPU_INTENSIVE set,
   recommended.  In most use cases, concurrency level usually stays
   well under the default limit.
 
-* A wq serves as a domain for forward progress guarantee (WQ_RESCUER),
-  flush and work item attributes.  Work items which are not involved
-  in memory reclaim and don't need to be flushed as a part of a group
-  of work items, and don't require any special attribute, can use one
-  of the system wq.  There is no difference in execution
-  characteristics between using a dedicated wq and a system wq.
+* A wq serves as a domain for forward progress guarantee
+  (WQ_MEM_RECLAIM, flush and work item attributes.  Work items which
+  are not involved in memory reclaim and don't need to be flushed as a
+  part of a group of work items, and don't require any special
+  attribute, can use one of the system wq.  There is no difference in
+  execution characteristics between using a dedicated wq and a system
+  wq.
 
 * Unless work items are expected to consume a huge amount of CPU
   cycles, using a bound wq is usually beneficial due to the increased
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index e30c537cce32..f5296bb19ec0 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -3335,7 +3335,7 @@ void ata_sff_port_init(struct ata_port *ap)
 
 int __init ata_sff_init(void)
 {
-	ata_sff_wq = alloc_workqueue("ata_sff", WQ_RESCUER, WQ_MAX_ACTIVE);
+	ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
 	if (!ata_sff_wq)
 		return -ENOMEM;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..1c5f46075d52 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -140,7 +140,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_NON_REENTRANT | WQ_RESCUER, 0);
+					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..6838aefca71f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1933,7 +1933,7 @@ xfs_buf_init(void)
 		goto out;
 
 	xfslogd_workqueue = alloc_workqueue("xfslogd",
-					WQ_RESCUER | WQ_HIGHPRI, 1);
+					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index e33ff4a91703..03bbe903e5ce 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -243,11 +243,12 @@ enum {
 	WQ_NON_REENTRANT	= 1 << 0, /* guarantee non-reentrance */
 	WQ_UNBOUND		= 1 << 1, /* not bound to any cpu */
 	WQ_FREEZEABLE		= 1 << 2, /* freeze during suspend */
-	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
+	WQ_MEM_RECLAIM		= 1 << 3, /* may be used for memory reclaim */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
 
 	WQ_DYING		= 1 << 6, /* internal: workqueue is dying */
+	WQ_RESCUER		= 1 << 7, /* internal: workqueue has rescuer */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
@@ -309,7 +310,7 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
  * @name: name of the workqueue
- * @flags: WQ_* flags (only WQ_FREEZEABLE and WQ_RESCUER are meaningful)
+ * @flags: WQ_* flags (only WQ_FREEZEABLE and WQ_MEM_RECLAIM are meaningful)
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
@@ -325,11 +326,11 @@ alloc_ordered_workqueue(const char *name, unsigned int flags)
 }
 
 #define create_workqueue(name)					\
-	alloc_workqueue((name), WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
 #define create_freezeable_workqueue(name)			\
-	alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
 #define create_singlethread_workqueue(name)			\
-	alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b57a8babdec3..2c6871cbcbee 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2847,6 +2847,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	struct workqueue_struct *wq;
 	unsigned int cpu;
 
+	/*
+	 * Workqueues which may be used during memory reclaim should
+	 * have a rescuer to guarantee forward progress.
+	 */
+	if (flags & WQ_MEM_RECLAIM)
+		flags |= WQ_RESCUER;
+
 	/*
 	 * Unbound workqueues aren't concurrency managed and should be
 	 * dispatched to workers immediately.
-- 
cgit v1.2.3


From 9b5cd10e4c14a1a642076ace6a73be3d33c91fb6 Mon Sep 17 00:00:00 2001
From: Srinivas Eeda <srinivas.eeda@oracle.com>
Date: Tue, 5 Oct 2010 15:53:06 -0700
Subject: ocfs2: validate bg_free_bits_count after update

This patch adds a safe check to ensure bg_free_bits_count doesn't exceed
bg_bits in a group descriptor. This is to avoid on disk corruption that was
seen recently.

debugfs: group <52803072>
       Group Chain: 179   Parent Inode: 11  Generation: 2959379682
       CRC32: 00000000   ECC: 0000
       ##   Block#            Total    Used     Free     Contig   Size
       0    52803072          32256    4294965350   34202    18207    4032
       ......

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/suballoc.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8a286f54dca1..64f2c50a1c37 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	}
 
 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
 	while(num_bits--)
 		ocfs2_set_bit(bit_off++, bitmap);
 
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
 				(unsigned long *) undo_bg->bg_bitmap);
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
 
 	if (undo_fn)
 		jbd_unlock_bh_state(group_bh);
-- 
cgit v1.2.3


From f30d44f3e54a94e037da7a71d714b585dab28d9e Mon Sep 17 00:00:00 2001
From: Poyo VL <poyo_vl@yahoo.com>
Date: Mon, 11 Oct 2010 13:45:52 -0700
Subject: When I tried to compile I got the following warning:
 fs/ocfs2/slot_map.c: In function ‘ocfs2_init_slot_info’:
 fs/ocfs2/slot_map.c:360: warning: ‘bytes’ may be used uninitialized in this
 function fs/ocfs2/slot_map.c:360: note: ‘bytes’ was declared here Compiler:
 gcc version 4.4.3 (GCC) on Mandriva I'm not sure why this warning occurs, I
 think compiler don't know that variable "bytes" is initialized when it is
 sent by reference to ocfs2_slot_map_physical_size and it throws that ugly
 warning. However, a simple initialization of "bytes" variable with 0 will fix
 it.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ionut Gabriel Popescu <poyo_vl@yahoo.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/slot_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 {
 	int status = 0;
 	u64 blkno;
-	unsigned long long blocks, bytes;
+	unsigned long long blocks, bytes = 0;
 	unsigned int i;
 	struct buffer_head *bh;
 
-- 
cgit v1.2.3


From 75d9bbc73804285020aa4d99bd2a9600edea8945 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Mon, 11 Oct 2010 12:57:09 -0500
Subject: Initialize max_slots early

Functions such as ocfs2_recovery_init() make use of osb->max_slots.
Initialize osb->max_slots early so the functions may use the correct
value.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/super.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 350e8b5a9396..b578644b6637 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2105,6 +2105,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+		     osb->max_slots);
+		status = -EINVAL;
+		goto bail;
+	}
+	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
 	ocfs2_orphan_scan_init(osb);
 
 	status = ocfs2_recovery_init(osb);
@@ -2143,15 +2152,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
-	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
-		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-		     osb->max_slots);
-		status = -EINVAL;
-		goto bail;
-	}
-	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
 	osb->slot_recovery_generations =
 		kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
 			GFP_KERNEL);
-- 
cgit v1.2.3


From 7bdb0d18bfd381cc5491eb95973ec5604b356c7e Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Mon, 11 Oct 2010 16:46:39 +0800
Subject: ocfs2: Add a mount option "coherency=*" to handle cluster coherency
 for O_DIRECT writes.

Currently, the default behavior of O_DIRECT writes was allowing
concurrent writing among nodes to the same file, with no cluster
coherency guaranteed (no EX lock held).  This can leave stale data in
the cache for buffered reads on other nodes.

The new mount option introduce a chance to choose two different
behaviors for O_DIRECT writes:

    * coherency=full, as the default value, will disallow
                      concurrent O_DIRECT writes by taking
                      EX locks.

    * coherency=buffered, allow concurrent O_DIRECT writes
                          without EX lock among nodes, which
                          gains high performance at risk of
                          getting stale data on other nodes.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 Documentation/filesystems/ocfs2.txt |  7 +++++++
 fs/ocfs2/file.c                     | 29 +++++++++++++++++++++++++++--
 fs/ocfs2/ocfs2.h                    |  3 +++
 fs/ocfs2/super.c                    | 15 +++++++++++++++
 4 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 1f7ae144f6d8..5393e6611691 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -87,3 +87,10 @@ dir_resv_level=	(*)	By default, directory reservations will scale with file
 			reservations - users should rarely need to change this
 			value. If allocation reservations are turned off, this
 			option will have no effect.
+coherency=full  (*)	Disallow concurrent O_DIRECT writes, cluster inode
+			lock will be taken to force other nodes drop cache,
+			therefore full cluster coherency is guaranteed even
+			for O_DIRECT writes.
+coherency=buffered	Allow concurrent O_DIRECT writes without EX lock among
+			nodes, which gains high performance at risk of getting
+			stale data on other nodes.
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 13af9937bdda..9e8cc4346b76 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2225,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
@@ -2248,14 +2250,37 @@ relock:
 		have_alloc_sem = 1;
 	}
 
-	/* concurrent O_DIRECT writes are allowed */
-	rw_level = !direct_io;
+	/*
+	 * Concurrent O_DIRECT writes are allowed with
+	 * mount_option "coherency=buffered".
+	 */
+	rw_level = (!direct_io || full_coherency);
+
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_sems;
 	}
 
+	/*
+	 * O_DIRECT writes with "coherency=full" need to take EX cluster
+	 * inode_lock to guarantee coherency.
+	 */
+	if (direct_io && full_coherency) {
+		/*
+		 * We need to take and drop the inode lock to force
+		 * other nodes to drop their caches.  Buffered I/O
+		 * already does this in write_begin().
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_sems;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+	}
+
 	can_do_direct = direct_io;
 	ret = ocfs2_prepare_inode_for_write(file, ppos,
 					    iocb->ki_left, appending,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 687e291d73f2..3064feef1430 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -263,6 +263,9 @@ enum ocfs2_mount_options
 						   control lists */
 	OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
 	OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+
+	OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT
+						    writes */
 };
 
 #define OCFS2_OSB_SOFT_RO			0x0001
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b578644b6637..9122d59f8127 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -177,6 +177,8 @@ enum {
 	Opt_noacl,
 	Opt_usrquota,
 	Opt_grpquota,
+	Opt_coherency_buffered,
+	Opt_coherency_full,
 	Opt_resv_level,
 	Opt_dir_resv_level,
 	Opt_err,
@@ -205,6 +207,8 @@ static const match_table_t tokens = {
 	{Opt_noacl, "noacl"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_grpquota, "grpquota"},
+	{Opt_coherency_buffered, "coherency=buffered"},
+	{Opt_coherency_full, "coherency=full"},
 	{Opt_resv_level, "resv_level=%u"},
 	{Opt_dir_resv_level, "dir_resv_level=%u"},
 	{Opt_err, NULL}
@@ -1452,6 +1456,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_grpquota:
 			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
 			break;
+		case Opt_coherency_buffered:
+			mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
+		case Opt_coherency_full:
+			mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
 		case Opt_acl:
 			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
 			mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1550,6 +1560,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_GRPQUOTA)
 		seq_printf(s, ",grpquota");
 
+	if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+		seq_printf(s, ",coherency=buffered");
+	else
+		seq_printf(s, ",coherency=full");
+
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 		seq_printf(s, ",nouser_xattr");
 	else
-- 
cgit v1.2.3


From ecec6e34e18660799444c5a163c7313a20fba701 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 11 Oct 2010 16:49:44 -0400
Subject: nfsd4: expire clients more promptly

Expire clients more promptly, at the expense of possibly running the
laundromat thread more frequently.

Though it's not the default, I'd like it to be feasible to run with a
lease time of just a few seconds, at which point a minimum 10 second
wait between laundromat runs seems a little much.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..6b641cf2c19a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -249,7 +249,7 @@ extern time_t nfsd4_grace;
 #define	COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12     /* OP_SETATTR */
 
-#define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+#define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
-- 
cgit v1.2.3


From 9daa42e22030f0c5c357a5e1f8af658411beda6b Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sun, 10 Oct 2010 13:21:05 -0500
Subject: CIFS ntlm authentication and signing - Build a proper av/ti pair blob
 for ntlmv2 without extended security authentication

Build an av pair blob as part of ntlmv2 (without extended security) auth
request.  Include netbios and dns names for domain and server and
a timestamp in the blob.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 69 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 89fb94fac4b5..e3edd8a6840b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -263,27 +263,85 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
-/* This is just a filler for ntlmv2 type of security mechanisms.
- * Older servers are not very particular about the contents of av pairs
- * in the blob and for sec mechs like ntlmv2, there is no negotiation
- * as in ntlmssp, so unless domain and server  netbios and dns names
- * are specified, there is no way to obtain name.  In case of ntlmssp,
- * server provides that info in type 2 challenge packet
+/* Build a proper attribute value/target info pairs blob.
+ * Fill in netbios and dns domain name and workstation name
+ * and client time (total five av pairs and + one end of fields indicator.
+ * Allocate domain name which gets freed when session struct is deallocated.
  */
 static int
-build_avpair_blob(struct cifsSesInfo *ses)
+build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
+	unsigned int dlen;
+	unsigned int wlen;
+	unsigned int size = 6 * sizeof(struct ntlmssp2_name);
+	__le64  curtime;
+	char *defdmname = "WORKGROUP";
+	unsigned char *blobptr;
 	struct ntlmssp2_name *attrptr;
 
-	ses->tilen = 2 * sizeof(struct ntlmssp2_name);
+	if (!ses->domainName) {
+		ses->domainName = kstrdup(defdmname, GFP_KERNEL);
+		if (!ses->domainName)
+			return -ENOMEM;
+	}
+
+	dlen = strlen(ses->domainName);
+	wlen = strlen(ses->server->hostname);
+
+	/* The length of this blob is a size which is
+	 * six times the size of a structure which holds name/size +
+	 * two times the unicode length of a domain name +
+	 * two times the unicode length of a server name +
+	 * size of a timestamp (which is 8 bytes).
+	 */
+	ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
 	ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL);
 	if (!ses->tiblob) {
 		ses->tilen = 0;
 		cERROR(1, "Challenge target info allocation failure");
 		return -ENOMEM;
 	}
-	attrptr = (struct ntlmssp2_name *) ses->tiblob;
-	attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
+
+	blobptr = ses->tiblob;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
+	attrptr->length = cpu_to_le16(sizeof(__le64));
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	memcpy(blobptr, &curtime, sizeof(__le64));
 
 	return 0;
 }
@@ -429,7 +487,7 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 			}
 		}
 	} else {
-		rc = build_avpair_blob(ses);
+		rc = build_avpair_blob(ses, nls_cp);
 		if (rc) {
 			cERROR(1, "error %d building av pair blob", rc);
 			return rc;
-- 
cgit v1.2.3


From 4a9410355406c31695eab9daeba694d7d9714e9b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Sep 2010 14:33:08 +0000
Subject: hfs: Convert tree_lock to mutex

tree_lock is used as mutex so make it a mutex.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
LKML-Reference: <20100907125056.416332114@linutronix.de>
---
 fs/hfs/bfind.c | 4 ++--
 fs/hfs/btree.c | 2 +-
 fs/hfs/btree.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
-- 
cgit v1.2.3


From 756b0322e50aebc4b9afb4488a2d3f6c802b4e64 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Sep 2010 14:33:11 +0000
Subject: affs: Use sema_init instead of init_MUTEX

Get rid of init_MUTE() and use sema_init() instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
LKML-Reference: <20100907125056.511395595@linutronix.de>
---
 fs/affs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..9581ea94d5a1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	init_MUTEX(&ei->i_link_lock);
-	init_MUTEX(&ei->i_ext_lock);
+	sema_init(&ei->i_link_lock, 1);
+	sema_init(&ei->i_ext_lock, 1);
 	inode_init_once(&ei->vfs_inode);
 }
 
-- 
cgit v1.2.3


From 1c456013e96f10915578dc61095d19b4906f64ac Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 12 Oct 2010 11:32:42 -0400
Subject: cifs: on multiuser mount, set ownership to
 current_fsuid/current_fsgid (try #7)

commit 3aa1c8c2900065a51268430ab48a1b42fdfe5b45 made cifs_getattr set
the ownership of files to current_fsuid/current_fsgid when multiuser
mounts were in use and when mnt_uid/mnt_gid were non-zero.

It should have instead based that decision on the
CIFS_MOUNT_OVERR_UID/GID flags.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f629159be4e0..94979309698a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1755,7 +1755,7 @@ check_inval:
 }
 
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-	struct kstat *stat)
+		 struct kstat *stat)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
 	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -1765,11 +1765,17 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
 		stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+
+		/*
+		 * If on a multiuser mount without unix extensions, and the
+		 * admin hasn't overridden them, set the ownership to the
+		 * fsuid/fsgid of the current process.
+		 */
 		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
 		    !tcon->unix_ext) {
-			if (!cifs_sb->mnt_uid)
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
 				stat->uid = current_fsuid();
-			if (!cifs_sb->mnt_uid)
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
 				stat->gid = current_fsgid();
 		}
 	}
-- 
cgit v1.2.3


From a5e18bc36e9e05ce0338d370a2ce4290910e43ea Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 11 Oct 2010 15:07:18 -0400
Subject: cifs: keep dentry reference in cifsFileInfo instead of inode
 reference

cifsFileInfo is a bit problematic. It contains a reference back to the
struct file itself. This makes it difficult for a cifsFileInfo to exist
without a corresponding struct file.

It would be better instead of the cifsFileInfo just held info pertaining
to the open file on the server instead without any back refrences to the
struct file. This would allow it to exist after the filp to which it was
originally attached was closed.

Much of the use of the file pointer in this struct is to get at the
dentry.  Begin divorcing the cifsFileInfo from the struct file by
keeping a reference to the dentry. Since the dentry will have a
reference to the inode, we can eliminate the "pInode" field too and
convert the igrab/iput to dget/dput.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 4 ++--
 fs/cifs/dir.c      | 3 ++-
 fs/cifs/file.c     | 2 +-
 fs/cifs/misc.c     | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f85dfdf197d..8289e61937a2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -387,7 +387,7 @@ struct cifsFileInfo {
 	/* BB add lock scope info here if needed */ ;
 	/* lock scope id (0 if none) */
 	struct file *pfile; /* needed for writepage */
-	struct inode *pInode; /* needed for oplock break */
+	struct dentry *dentry;
 	struct vfsmount *mnt;
 	struct tcon_link *tlink;
 	struct mutex lock_mutex;
@@ -412,7 +412,7 @@ static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
 	if (atomic_dec_and_test(&cifs_file->count)) {
 		cifs_put_tlink(cifs_file->tlink);
-		iput(cifs_file->pInode);
+		dput(cifs_file->dentry);
 		kfree(cifs_file);
 	}
 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e249b561ce8f..6887c412c61a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -135,6 +135,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 		  struct vfsmount *mnt, struct tcon_link *tlink,
 		  unsigned int oflags, __u32 oplock)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 
@@ -145,7 +146,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	pCifsFile->netfid = fileHandle;
 	pCifsFile->pid = current->tgid;
 	pCifsFile->uid = current_fsuid();
-	pCifsFile->pInode = igrab(newinode);
+	pCifsFile->dentry = dget(dentry);
 	pCifsFile->mnt = mnt;
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 80856f180711..c302b9c52644 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2335,7 +2335,7 @@ void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
-	struct inode *inode = cfile->pInode;
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	int rc, waitrc = 0;
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 252f2768db84..9bac3e74b314 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -578,7 +578,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				}
 
 				cFYI(1, "file id match, oplock break");
-				pCifsInode = CIFS_I(netfile->pInode);
+				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 				pCifsInode->clientCanCacheAll = false;
 				if (pSMB->OplockLevel == 0)
 					pCifsInode->clientCanCacheRead = false;
-- 
cgit v1.2.3


From d7c86ff8cd00abc730fe5d031f43dc9138b6324e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 11 Oct 2010 15:07:19 -0400
Subject: cifs: don't use vfsmount to pin superblock for oplock breaks

Filesystems aren't really supposed to do anything with a vfsmount. It's
considered a layering violation since vfsmounts are entirely managed at
the VFS layer.

CIFS currently keeps an active reference to a vfsmount in order to
prevent the superblock vanishing before an oplock break has completed.
What we really want to do instead is to keep sb->s_active high until the
oplock break has completed. This patch borrows the scheme that NFS uses
for handling sillyrenames.

An atomic_t is added to the cifs_sb_info. When it transitions from 0 to
1, an extra reference to the superblock is taken (by bumping the
s_active value). When it transitions from 1 to 0, that reference is
dropped and a the superblock teardown may proceed if there are no more
references to it.

Also, the vfsmount pointer is removed from cifsFileInfo and from
cifs_new_fileinfo, and some bogus forward declarations are removed from
cifsfs.h.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  1 +
 fs/cifs/cifsfs.c     | 18 ++++++++++++++++++
 fs/cifs/cifsfs.h     |  6 ++----
 fs/cifs/cifsglob.h   |  1 -
 fs/cifs/cifsproto.h  |  2 +-
 fs/cifs/dir.c        | 10 +++-------
 fs/cifs/file.c       |  9 ++++-----
 7 files changed, 29 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 586ee3d527d2..525ba59a4105 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -48,6 +48,7 @@ struct cifs_sb_info {
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
+	atomic_t active;
 	uid_t	mnt_uid;
 	gid_t	mnt_gid;
 	mode_t	mnt_file_mode;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3258c822328b..cbd468c880c4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -83,6 +83,24 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
+void
+cifs_sb_active(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb, void *data,
 		const char *devname, int silent)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 786bdf36aebf..85544bcab517 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
-/* extern const struct super_operations cifs_super_ops;*/
-extern void cifs_read_inode(struct inode *);
-/*extern void cifs_delete_inode(struct inode *);*/  /* BB not needed yet */
-/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
+extern void cifs_sb_active(struct super_block *sb);
+extern void cifs_sb_deactive(struct super_block *sb);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8289e61937a2..e2b760ef22ff 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -388,7 +388,6 @@ struct cifsFileInfo {
 	/* lock scope id (0 if none) */
 	struct file *pfile; /* needed for writepage */
 	struct dentry *dentry;
-	struct vfsmount *mnt;
 	struct tcon_link *tlink;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 29a2ee8ae51f..7f416abd34cf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,7 +107,7 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 
 extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
-				struct vfsmount *mnt, struct tcon_link *tlink,
+				struct tcon_link *tlink,
 				unsigned int oflags, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6887c412c61a..c205ec9293ea 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -132,8 +132,7 @@ cifs_bp_rename_retry:
 
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
-		  struct vfsmount *mnt, struct tcon_link *tlink,
-		  unsigned int oflags, __u32 oplock)
+		  struct tcon_link *tlink, unsigned int oflags, __u32 oplock)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct cifsFileInfo *pCifsFile;
@@ -147,7 +146,6 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	pCifsFile->pid = current->tgid;
 	pCifsFile->uid = current_fsuid();
 	pCifsFile->dentry = dget(dentry);
-	pCifsFile->mnt = mnt;
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
@@ -485,8 +483,7 @@ cifs_create_set_dentry:
 		}
 
 		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-						nd->path.mnt, tlink, oflags,
-						oplock);
+						tlink, oflags, oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -760,8 +757,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			}
 
 			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  nd->path.mnt, tlink,
-						  nd->intent.open.flags,
+						  tlink, nd->intent.open.flags,
 						  oplock);
 			if (cfile == NULL) {
 				fput(filp);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c302b9c52644..fd78a355f634 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -282,7 +282,6 @@ int cifs_open(struct inode *inode, struct file *file)
 			}
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
-							file->f_path.mnt,
 							tlink, oflags, oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
@@ -375,8 +374,8 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (rc != 0)
 		goto out;
 
-	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
-					tlink, file->f_flags, oplock);
+	pCifsFile = cifs_new_fileinfo(inode, netfid, file, tlink,
+					file->f_flags, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
@@ -2381,14 +2380,14 @@ void cifs_oplock_break(struct work_struct *work)
 
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	mntget(cfile->mnt);
+	cifs_sb_active(cfile->dentry->d_sb);
 	cifsFileInfo_get(cfile);
 }
 
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	mntput(cfile->mnt);
 	cifsFileInfo_put(cfile);
+	cifs_sb_deactive(cfile->dentry->d_sb);
 }
 
 const struct address_space_operations cifs_addr_ops = {
-- 
cgit v1.2.3


From ee52716245877b821f5ddbb3ace85b73084fb450 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 14 Oct 2010 09:53:37 -0400
Subject: hfsplus: fix oops on mount with corrupted btree extent records

A particular fsfuzzer run caused an hfs file system to crash on mount. This
is due to a corrupted MDB extent record causing a miscalculation of
HFSPLUS_I(inode)->first_blocks for the extent tree. If the extent records
are zereod out, then it won't trigger the first_blocks special case and
instead falls through to the extent code, which we're in the middle
of initializing.

This patch catches the 0 size extent records, reports the corruption,
and fails the mount.

[hch: ported of commit 47f365eb575735c6b2edf5d08e0d16d26a9c23bd from hfs]

Reported-by: Ramon de Carvalho Valle <rcvalle@linux.vnet.ibm.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/btree.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 8306479f6b3a..d4bd864ce108 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 	tree->inode = inode;
 
+	if (!HFSPLUS_I(tree->inode)->first_blocks) {
+		printk(KERN_ERR
+		       "hfs: invalid btree extent records (0 size).\n");
+		goto free_inode;
+	}
+
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto free_tree;
+		goto free_inode;
 
 	/* Load the header */
 	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -89,8 +95,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
  fail_page:
 	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	page_cache_release(page);
- free_tree:
+ free_inode:
 	iput(tree->inode);
+ free_tree:
 	kfree(tree);
 	return NULL;
 }
-- 
cgit v1.2.3


From b6b41424f0ec28e9a167fa29b003327860b4b71b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Thu, 14 Oct 2010 09:53:42 -0400
Subject: hfsplus: hfs_bnode_find() can fail, resulting in hfs_bnode_split()
 breakage

oops and fs corruption; the latter can happen even on valid fs in case of oom.

[hch: port of commit 3d10a15d6919488204bdb264050d156ced20d9aa from hfs]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/brec.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..fa903641992f 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -216,7 +216,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +235,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +259,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 		/* panic? */
 		hfs_bnode_put(node);
 		hfs_bnode_put(new_node);
+		if (next_node)
+			hfs_bnode_put(next_node);
 		return ERR_PTR(-ENOSPC);
 	}
 
@@ -302,8 +315,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
-- 
cgit v1.2.3


From 9250f925972d03ccc0c0a4dd4e9b794d2ef6d52b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 14 Oct 2010 09:53:48 -0400
Subject: hfsplus: handle more on-disk corruptions without oopsing

hfs seems prone to bad things when it encounters on disk corruption.  Many
values are read from disk, and used as lengths to memcpy, as an example.
This patch fixes up several of these problematic cases.

o sanity check the on-disk maximum key lengths on mount
  (these are set to a defined value at mkfs time and shouldn't differ)
o check on-disk node keylens against the maximum key length for each tree
o fix hfs_btree_open so that going out via free_tree: doesn't wind
  up in hfs_releasepage, which wants to follow the very pointer
  we were trying to set up:
	HFS_SB(sb)->cat_tree = hfs_btree_open()
    .
  failure gets to hfs_releasepage and tries to follow HFS_SB(sb)->cat_tree

Tested with the fsfuzzer; it survives more than it used to.

[hch: ported of commit cf0594625083111ae522496dc1c256f7476939c2 from hfs]
[hch: added the fixes from 5581d018ed3493d226e7a4d645d9c8a5af6c36b]

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bfind.c       | 13 +++++++++++++
 fs/hfsplus/brec.c        | 15 +++++++++++++--
 fs/hfsplus/btree.c       | 25 ++++++++++++++++++++-----
 fs/hfsplus/hfsplus_raw.h |  3 ++-
 4 files changed, 48 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 68c7983f0289..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +83,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index fa903641992f..790cd6a8605d 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,21 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
+		if (node->tree->attributes & HFS_TREE_BIGKEYS) {
 			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
+			if (retval > node->tree->max_key_len + 2) {
+				printk(KERN_ERR "hfs: keylen %d too large\n",
+					retval);
+				retval = 0;
+			}
+		} else {
 			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+			if (retval > node->tree->max_key_len + 1) {
+				printk(KERN_ERR "hfs: keylen %d too large\n",
+					retval);
+				retval = 0;
+			}
+		}
 	}
 	return retval;
 }
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index d4bd864ce108..82caff215e08 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -63,10 +63,23 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->max_key_len = be16_to_cpu(head->max_key_len);
 	tree->depth = be16_to_cpu(head->depth);
 
-	/* Set the correct compare function */
-	if (id == HFSPLUS_EXT_CNID) {
+	/* Verify the tree and set the correct compare function */
+	switch (id) {
+	case HFSPLUS_EXT_CNID:
+		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
 		tree->keycmp = hfsplus_ext_cmp_key;
-	} else if (id == HFSPLUS_CAT_CNID) {
+		break;
+	case HFSPLUS_CAT_CNID:
+		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+
 		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
@@ -74,7 +87,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 			tree->keycmp = hfsplus_cat_case_cmp_key;
 			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
-	} else {
+		break;
+	default:
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
 		goto fail_page;
 	}
@@ -84,6 +98,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
+
 	tree->node_size_shift = ffs(size) - 1;
 
 	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -93,9 +108,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	return tree;
 
  fail_page:
-	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	page_cache_release(page);
  free_inode:
+	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	iput(tree->inode);
  free_tree:
 	kfree(tree);
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
 	struct hfsplus_unistr name;
 } __packed;
 
+#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
 
 /* Structs from hfs.h */
 struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
 	__be32 start_block;
 } __packed;
 
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
 
 /* HFS+ generic BTree key */
 typedef union {
-- 
cgit v1.2.3


From 13571a6977f821fab7d9c3cc5f75da52b7732e40 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Thu, 14 Oct 2010 09:54:23 -0400
Subject: hfsplus: validate btree flags

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/brec.c  | 20 ++++++--------------
 fs/hfsplus/btree.c | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 790cd6a8605d..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,20 +42,12 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS) {
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-			if (retval > node->tree->max_key_len + 2) {
-				printk(KERN_ERR "hfs: keylen %d too large\n",
-					retval);
-				retval = 0;
-			}
-		} else {
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
-			if (retval > node->tree->max_key_len + 1) {
-				printk(KERN_ERR "hfs: keylen %d too large\n",
-					retval);
-				retval = 0;
-			}
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+				retval);
+			retval = 0;
 		}
 	}
 	return retval;
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 82caff215e08..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -71,6 +71,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 				tree->max_key_len);
 			goto fail_page;
 		}
+		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+			printk(KERN_ERR "hfs: invalid extent btree flag\n");
+			goto fail_page;
+		}
+
 		tree->keycmp = hfsplus_ext_cmp_key;
 		break;
 	case HFSPLUS_CAT_CNID:
@@ -79,6 +84,10 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 				tree->max_key_len);
 			goto fail_page;
 		}
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+			goto fail_page;
+		}
 
 		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
@@ -93,6 +102,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto fail_page;
 	}
 
+	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+		printk(KERN_ERR "hfs: invalid btree flag\n");
+		goto fail_page;
+	}
+
 	size = tree->node_size;
 	if (!is_power_of_2(size))
 		goto fail_page;
-- 
cgit v1.2.3


From f6089ff87d309a8ddb7b0d4dd92a570f1b0f689b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Thu, 14 Oct 2010 09:54:28 -0400
Subject: hfsplus: fix link corruption

HFS implements hardlink by using indirect catalog entries that refer to a hidden
directly.  The link target is cached in the dev field in the HFS+ specific
inode, which is also used for the device number for device files, and inside
for passing the nlink value of the indirect node from hfsplus_cat_write_inode
to a helper function.  Now if we happen to write out the indirect node while
hfsplus_link is creating the catalog entry we'll get a link pointing to the
linkid of the current nlink value.  This can easily be reproduced by a large
enough loop of local git-clone operations.

Stop abusing the dev field in the HFS+ inode for short term storage by
refactoring the way the permission structure in the catalog entry is
set up, and rename the dev field to linkid to avoid any confusion.

While we're at it also prevent creating hard links to special files, as
the HFS+ dev and linkid share the same space in the on-disk structure.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/catalog.c    |  2 +-
 fs/hfsplus/dir.c        |  6 ++++--
 fs/hfsplus/hfsplus_fs.h |  8 +++++---
 fs/hfsplus/inode.c      | 14 ++++++++------
 4 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 48979c4e8fa5..9d1594b0a07a 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -134,7 +134,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
 			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->dev);
+			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
 		}
 		return sizeof(*file);
 	}
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 33aab211695a..c05c8776e836 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -102,7 +102,7 @@ again:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (S_ISREG(inode->i_mode))
-		HFSPLUS_I(inode)->dev = linkid;
+		HFSPLUS_I(inode)->linkid = linkid;
 out:
 	d_add(dentry, inode);
 	return NULL;
@@ -252,6 +252,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
 
 	mutex_lock(&sbi->vh_mutex);
 	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
@@ -268,7 +270,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			if (res != -EEXIST)
 				goto out;
 		}
-		HFSPLUS_I(inode)->dev = id;
+		HFSPLUS_I(inode)->linkid = id;
 		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5cda96366acf..d92f590d6633 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -178,7 +178,11 @@ struct hfsplus_inode_info {
 	 */
 	struct inode *rsrc_inode;
 	__be32 create_date;
-	u32 dev;
+
+	/*
+	 * Protected by sbi->vh_mutex.
+	 */
+	u32 linkid;
 
 	/*
 	 * Protected by i_mutex.
@@ -427,6 +431,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 #define hfsp_ut2mt(t)		__hfsp_ut2mt((t).tv_sec)
 #define hfsp_now2mt()		__hfsp_ut2mt(get_seconds())
 
-#define kdev_t_to_nr(x)		(x)
-
 #endif
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a05b3afa7230..746e0ee20717 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,13 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
-	perms->dev = cpu_to_be32(HFSPLUS_I(inode)->dev);
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
 }
 
 static int hfsplus_file_open(struct inode *inode, struct file *file)
@@ -491,7 +497,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
-	HFSPLUS_I(inode)->dev = 0;
+	HFSPLUS_I(inode)->linkid = 0;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
@@ -595,10 +601,6 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
-		if (S_ISREG(inode->i_mode))
-			HFSPLUS_I(inode)->dev = inode->i_nlink;
-		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			HFSPLUS_I(inode)->dev = kdev_t_to_nr(inode->i_rdev);
 		hfsplus_set_perms(inode, &file->permissions);
 		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
-- 
cgit v1.2.3


From 722c55d13e7296cc62ed8a38f926a915ff32e4ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Thu, 14 Oct 2010 09:54:33 -0400
Subject: hfsplus: remove superflous rootflags field in hfsplus_inode_info

The rootflags field in hfsplus_inode_info only caches the immutable and
append-only flags in the VFS inode, so we can easily get rid of it.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/catalog.c    |  1 -
 fs/hfsplus/hfsplus_fs.h |  2 +-
 fs/hfsplus/inode.c      |  1 -
 fs/hfsplus/ioctl.c      | 25 +++++++++++--------------
 4 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 9d1594b0a07a..bbcf5a27b5d3 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -77,7 +77,6 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode)->rootflags = perms->rootflags;
 	HFSPLUS_I(inode)->userflags = perms->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d92f590d6633..0d77844a00e8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -188,7 +188,7 @@ struct hfsplus_inode_info {
 	 * Protected by i_mutex.
 	 */
 	sector_t fs_blocks;
-	u8 rootflags, userflags;	/* BSD system and user file flags */
+	u8 userflags;		/* BSD user file flags */
 	struct list_head open_dir_list;
 	loff_t phys_size;
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 746e0ee20717..df57ea1f3a71 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -241,7 +241,6 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
 		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
 	inode->i_mode = mode;
 
-	HFSPLUS_I(inode)->rootflags = perms->rootflags;
 	HFSPLUS_I(inode)->userflags = perms->userflags;
 	if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index c9ac443d202a..80eb5b3a5edd 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -26,9 +26,9 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags = 0;
 
-	if (hip->rootflags & HFSPLUS_FLG_IMMUTABLE)
+	if (inode->i_flags & S_IMMUTABLE)
 		flags |= FS_IMMUTABLE_FL;
-	if (hip->rootflags & HFSPLUS_FLG_APPEND)
+	if (inode->i_flags |= S_APPEND)
 		flags |= FS_APPEND_FL;
 	if (hip->userflags & HFSPLUS_FLG_NODUMP)
 		flags |= FS_NODUMP_FL;
@@ -59,8 +59,8 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 
 	mutex_lock(&inode->i_mutex);
 
-	if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-	    hip->rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
+	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
 			err = -EPERM;
 			goto out_unlock_inode;
@@ -72,20 +72,17 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 		err = -EOPNOTSUPP;
 		goto out_unlock_inode;
 	}
-	if (flags & FS_IMMUTABLE_FL) {
+
+	if (flags & FS_IMMUTABLE_FL)
 		inode->i_flags |= S_IMMUTABLE;
-		hip->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-	} else {
+	else
 		inode->i_flags &= ~S_IMMUTABLE;
-		hip->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-	}
-	if (flags & FS_APPEND_FL) {
+
+	if (flags & FS_APPEND_FL)
 		inode->i_flags |= S_APPEND;
-		hip->rootflags |= HFSPLUS_FLG_APPEND;
-	} else {
+	else
 		inode->i_flags &= ~S_APPEND;
-		hip->rootflags &= ~HFSPLUS_FLG_APPEND;
-	}
+
 	if (flags & FS_NODUMP_FL)
 		hip->userflags |= HFSPLUS_FLG_NODUMP;
 	else
-- 
cgit v1.2.3


From 90e616905a423126805186cb5754e10a704b30c8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Thu, 14 Oct 2010 09:54:39 -0400
Subject: hfsplus: create correct initial catalog entries for device files

Make sure the initial insertation of the catalog entry already contains
the device number by calling init_special_inode early and setting writing
out the dev field of the on-disk permission structure.  The latter is
facilitated by sharing the almost identical hfsplus_set_perms helpers
between initial catalog entry creating and ->write_inode.

Unless we crashed just after mknod this bug was harmless as the inode
is marked dirty at the end of hfsplus_mknod, and hfsplus_write_inode
will update the catalog entry to contain the correct value.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/catalog.c    | 16 ++++++++++++----
 fs/hfsplus/dir.c        |  6 +++---
 fs/hfsplus/hfsplus_fs.h |  1 +
 fs/hfsplus/inode.c      | 27 ++-------------------------
 4 files changed, 18 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index bbcf5a27b5d3..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
 	key->key_len = cpu_to_be16(6 + ustrlen);
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 {
 	if (inode->i_flags & S_IMMUTABLE)
 		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,10 +77,18 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode)->userflags = perms->userflags;
+
+	perms->userflags = HFSPLUS_I(inode)->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
 }
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
@@ -99,7 +107,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			folder->content_mod_date =
 			folder->attribute_mod_date =
 			folder->access_date = hfsp_now2mt();
-		hfsplus_set_perms(inode, &folder->permissions);
+		hfsplus_cat_set_perms(inode, &folder->permissions);
 		if (inode == sbi->hidden_dir)
 			/* invisible and namelocked */
 			folder->user_info.frFlags = cpu_to_be16(0x5000);
@@ -118,7 +126,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->attribute_mod_date =
 			file->access_date = hfsp_now2mt();
 		if (cnid == inode->i_ino) {
-			hfsplus_set_perms(inode, &file->permissions);
+			hfsplus_cat_set_perms(inode, &file->permissions);
 			if (S_ISLNK(inode->i_mode)) {
 				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
 				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index c05c8776e836..d236d85ec9d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -418,6 +418,9 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 	if (!inode)
 		goto out;
 
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
+
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
 	if (res) {
 		inode->i_nlink = 0;
@@ -426,9 +429,6 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 		goto out;
 	}
 
-	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
-		init_special_inode(inode, mode, rdev);
-
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
 out:
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 0d77844a00e8..d77dfd22ea00 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -326,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
 int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
 		       struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
 
 /* dir.c */
 extern const struct inode_operations hfsplus_dir_inode_operations;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index df57ea1f3a71..78449280dae0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -252,29 +252,6 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
 		inode->i_flags &= ~S_APPEND;
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
-	if (inode->i_flags & S_IMMUTABLE)
-		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-	if (inode->i_flags & S_APPEND)
-		perms->rootflags |= HFSPLUS_FLG_APPEND;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	perms->userflags = HFSPLUS_I(inode)->userflags;
-	perms->mode = cpu_to_be16(inode->i_mode);
-	perms->owner = cpu_to_be32(inode->i_uid);
-	perms->group = cpu_to_be32(inode->i_gid);
-
-	if (S_ISREG(inode->i_mode))
-		perms->dev = cpu_to_be32(inode->i_nlink);
-	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
-		perms->dev = cpu_to_be32(inode->i_rdev);
-	else
-		perms->dev = 0;
-}
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
@@ -578,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_folder));
 		/* simple node checks? */
-		hfsplus_set_perms(inode, &folder->permissions);
+		hfsplus_cat_set_perms(inode, &folder->permissions);
 		folder->access_date = hfsp_ut2mt(inode->i_atime);
 		folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
 		folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -600,7 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
-		hfsplus_set_perms(inode, &file->permissions);
+		hfsplus_cat_set_perms(inode, &file->permissions);
 		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		else
-- 
cgit v1.2.3


From 32e39e19ccb0f25c9c1b7ff246e17e795366bbbd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Thu, 14 Oct 2010 09:54:43 -0400
Subject: hfsplus: remove the unused hfsplus_kmap/hfsplus_kunmap helpers

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d77dfd22ea00..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -398,14 +398,6 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
 
-#if 1
-#define hfsplus_kmap(p)		({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p)	({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p)		kmap(p)
-#define hfsplus_kunmap(p)	kunmap(p)
-#endif
-
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
 	sector_t __block;				\
-- 
cgit v1.2.3


From 5d0d28824c76409f0d1a645bf0ae81318c8ffa42 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Wed, 13 Oct 2010 18:15:00 -0500
Subject: NTLM authentication and signing - Calculate auth response per smb
 session

Start calculation auth response within a session.  Move/Add pertinet
data structures like session key, server challenge and ntlmv2_hash in
a session structure.  We should do the calculations within a session
before copying session key and response over to server data
structures because a session setup can fail.

Only after a very first smb session succeeds, it copies/makes its
session key, session key of smb connection.  This key stays with
the smb connection throughout its life.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 16 ++++++++--------
 fs/cifs/cifsglob.h    |  6 ++++--
 fs/cifs/cifssmb.c     |  4 ++--
 fs/cifs/connect.c     | 18 ++++++++++++++----
 fs/cifs/sess.c        | 28 ++++++++--------------------
 5 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e3edd8a6840b..7ac0056294cf 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -458,7 +458,7 @@ calc_exit_1:
 calc_exit_2:
 	/* BB FIXME what about bytes 24 through 40 of the signing key?
 	   compare with the NTLM example */
-	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+	hmac_md5_final(ses->ntlmv2_hash, pctxt);
 
 	kfree(pctxt);
 	return rc;
@@ -502,14 +502,14 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	}
 	CalcNTLMv2_response(ses, resp_buf);
 
-	/* now calculate the MAC key for NTLMv2 */
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+	/* now calculate the session key for NTLMv2 */
+	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
 	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->server->session_key.data.ntlmv2.key, &context);
+	hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context);
 
-	memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
+	memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf,
 	       sizeof(struct ntlmv2_resp));
-	ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
+	ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp);
 
 	return 0;
 
@@ -526,8 +526,8 @@ void CalcNTLMv2_response(const struct cifsSesInfo *ses,
 {
 	struct HMACMD5Context context;
 	/* rest of v2 struct already generated */
-	memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+	memcpy(v2_session_response + 8, ses->cryptKey, 8);
+	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
 
 	hmac_md5_update(v2_session_response+8,
 			sizeof(struct ntlmv2_resp) - 8, &context);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e2b760ef22ff..6c69bd762498 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -179,12 +179,10 @@ struct TCP_Server_Info {
 	int capabilities; /* allow selective disabling of caps by smb sess */
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
-	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
 	struct session_key session_key;
-	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
 	/* extended security flavors that server supports */
@@ -192,6 +190,7 @@ struct TCP_Server_Info {
 	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
+	bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_FSCACHE
 	struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
@@ -223,6 +222,9 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
+	struct session_key auth_key;
+	char ntlmv2_hash[16];
 	unsigned int tilen; /* length of the target info blob */
 	unsigned char *tiblob; /* target info blob in challenge response */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 54bd83af772c..a420c7b08522 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 		if (rsp->EncryptionKeyLength ==
 				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(server->cryptKey, rsp->EncryptionKey,
+			memcpy(ses->cryptKey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
 			rc = -EIO; /* need cryptkey unless plain text */
@@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
+		memcpy(ses->cryptKey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
 			&& (pSMBr->EncryptionKeyLength == 0)) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4944fc84d5ef..019f00380d12 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -173,6 +173,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
+	server->sequence_number = 0;
+	server->session_estab = false;
 
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
@@ -205,7 +207,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			spin_lock(&GlobalMid_Lock);
 			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsGood;
-			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);
 	/*		atomic_set(&server->inFlight,0);*/
 			wake_up(&server->response_q);
@@ -1631,6 +1632,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
 	memcpy(tcp_ses->server_RFC1001_name,
 		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+	tcp_ses->session_estab = false;
 	tcp_ses->sequence_number = 0;
 	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
 	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -2983,14 +2985,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
 		    (ses->server->secType == LANMAN))
-			calc_lanman_hash(tcon->password, ses->server->cryptKey,
+			calc_lanman_hash(tcon->password, ses->cryptKey,
 					 ses->server->secMode &
 					    SECMODE_PW_ENCRYPT ? true : false,
 					 bcc_ptr);
 		else
 #endif /* CIFS_WEAK_PW_HASH */
-		SMBNTencrypt(tcon->password, ses->server->cryptKey,
-			     bcc_ptr);
+		SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr);
 
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
@@ -3175,6 +3176,15 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	if (rc) {
 		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
+		mutex_lock(&ses->server->srv_mutex);
+		if (!server->session_estab) {
+			memcpy(&server->session_key.data,
+				&ses->auth_key.data, ses->auth_key.len);
+			server->session_key.len = ses->auth_key.len;
+			ses->server->session_estab = true;
+		}
+		mutex_unlock(&server->srv_mutex);
+
 		cFYI(1, "CIFS Session Established successfully");
 		spin_lock(&GlobalMid_Lock);
 		ses->status = CifsGood;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c926e6c7c0c6..2111bed71b1f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -402,7 +402,7 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 		return -EINVAL;
 	}
 
-	memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
 	/* BB we could decode pblob->NegotiateFlags; some may be useful */
 	/* In particular we can examine sign flags */
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
@@ -591,17 +591,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	bool first_time;
 	int blob_len;
 	char *ntlmsspblob = NULL;
 
 	if (ses == NULL)
 		return -EINVAL;
 
-	read_lock(&cifs_tcp_ses_lock);
-	first_time = is_first_ses_reconnect(ses);
-	read_unlock(&cifs_tcp_ses_lock);
-
 	type = ses->server->secType;
 
 	cFYI(1, "sess setup type %d", type);
@@ -672,7 +667,7 @@ ssetup_ntlmssp_authenticate:
 		/* BB calculate hash with password */
 		/* and copy into bcc */
 
-		calc_lanman_hash(ses->password, ses->server->cryptKey,
+		calc_lanman_hash(ses->password, ses->cryptKey,
 				 ses->server->secMode & SECMODE_PW_ENCRYPT ?
 					true : false, lnm_session_key);
 
@@ -699,15 +694,11 @@ ssetup_ntlmssp_authenticate:
 			cpu_to_le16(CIFS_SESS_KEY_SIZE);
 
 		/* calculate session key */
-		SMBNTencrypt(ses->password, ses->server->cryptKey,
-			     ntlm_session_key);
+		SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key);
 
-		if (first_time) /* should this be moved into common code
-				  with similar ntlmv2 path? */
-			cifs_calculate_session_key(&ses->server->session_key,
-				ntlm_session_key, ses->password);
+		cifs_calculate_session_key(&ses->auth_key,
+					ntlm_session_key, ses->password);
 		/* copy session key */
-
 		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
@@ -794,17 +785,14 @@ ssetup_ntlmssp_authenticate:
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
-		    sizeof(ses->server->session_key.data.krb5)) {
+		    sizeof(ses->auth_key.data.krb5)) {
 			cERROR(1, "Kerberos signing key too long (%u bytes)",
 				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
-		if (first_time) {
-			ses->server->session_key.len = msg->sesskey_len;
-			memcpy(ses->server->session_key.data.krb5,
-				msg->data, msg->sesskey_len);
-		}
+		ses->auth_key.len = msg->sesskey_len;
+		memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities = cpu_to_le32(capabilities);
-- 
cgit v1.2.3


From 8fd01d6cfbf75465d84a4e533ed70c5f57b3ff51 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 14 Oct 2010 19:15:28 -0700
Subject: Export dump_{write,seek} to binary loader modules

If you build aout support as a module, you'll want these exported.

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 03278c984ba0..6d2b6f936858 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2024,6 +2024,7 @@ int dump_write(struct file *file, const void *addr, int nr)
 {
 	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
 }
+EXPORT_SYMBOL(dump_write);
 
 int dump_seek(struct file *file, loff_t off)
 {
@@ -2052,3 +2053,4 @@ int dump_seek(struct file *file, loff_t off)
 	}
 	return ret;
 }
+EXPORT_SYMBOL(dump_seek);
-- 
cgit v1.2.3


From 46bf36ecec79bbe5373ef68f0ed36cbf62e03482 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Fri, 15 Oct 2010 05:45:00 -0700
Subject: hfsplus: fix getxattr return value

We need to support -EOPNOTSUPP for attributes that are not supported to
match other filesystems and allow userspace to detect if Posix ACLs
are supported or not.  setxattr already gets this right.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 80eb5b3a5edd..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -192,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		} else
 			res = size ? -ERANGE : 4;
 	} else
-		res = -ENODATA;
+		res = -EOPNOTSUPP;
 out:
 	if (size)
 		hfs_find_exit(&fd);
-- 
cgit v1.2.3


From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 15 Aug 2010 18:52:59 +0200
Subject: llseek: automatically add .llseek fop

All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.

The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.

New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time.  Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.

The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.

Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.

Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.

===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
//   but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}

@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}

@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
   *off = E
|
   *off += E
|
   func(..., off, ...)
|
   E = *off
)
...+>
}

@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}

@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
  *off = E
|
  *off += E
|
  func(..., off, ...)
|
  E = *off
)
...+>
}

@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}

@ fops0 @
identifier fops;
@@
struct file_operations fops = {
 ...
};

@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
 .llseek = llseek_f,
...
};

@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
 .read = read_f,
...
};

@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
 .write = write_f,
...
};

@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
 .open = open_f,
...
};

// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
...  .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};

@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
...  .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};

// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
...  .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};

// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};

// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};

@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+	.llseek = default_llseek, /* write accesses f_pos */
};

// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////

@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
 .write = write_f,
 .read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};

@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};

@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};

@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
---
 arch/arm/kernel/etm.c                            |  1 +
 arch/arm/mach-msm/last_radio_log.c               |  3 ++-
 arch/arm/mach-msm/smd_debug.c                    |  1 +
 arch/arm/plat-mxc/audmux-v2.c                    |  1 +
 arch/avr32/boards/mimc200/fram.c                 |  1 +
 arch/blackfin/kernel/kgdb_test.c                 |  1 +
 arch/blackfin/mach-bf561/coreb.c                 |  1 +
 arch/cris/arch-v10/drivers/ds1302.c              |  1 +
 arch/cris/arch-v10/drivers/gpio.c                |  1 +
 arch/cris/arch-v10/drivers/i2c.c                 |  1 +
 arch/cris/arch-v10/drivers/pcf8563.c             |  1 +
 arch/cris/arch-v10/drivers/sync_serial.c         |  3 ++-
 arch/cris/arch-v32/drivers/cryptocop.c           |  3 ++-
 arch/cris/arch-v32/drivers/i2c.c                 |  1 +
 arch/cris/arch-v32/drivers/mach-a3/gpio.c        |  1 +
 arch/cris/arch-v32/drivers/mach-fs/gpio.c        |  1 +
 arch/cris/arch-v32/drivers/pcf8563.c             |  1 +
 arch/cris/arch-v32/drivers/sync_serial.c         |  3 ++-
 arch/cris/kernel/profile.c                       |  1 +
 arch/ia64/kernel/salinfo.c                       |  2 ++
 arch/ia64/sn/kernel/sn2/sn_hwperf.c              |  1 +
 arch/m68k/bvme6000/rtc.c                         |  1 +
 arch/m68k/mvme16x/rtc.c                          |  1 +
 arch/mips/kernel/rtlx.c                          |  3 ++-
 arch/mips/kernel/vpe.c                           |  3 ++-
 arch/mips/sibyte/common/sb_tbprof.c              |  1 +
 arch/powerpc/kernel/lparcfg.c                    |  1 +
 arch/powerpc/kernel/rtas_flash.c                 |  3 +++
 arch/powerpc/kernel/rtasd.c                      |  1 +
 arch/powerpc/platforms/iseries/mf.c              |  1 +
 arch/powerpc/platforms/pseries/reconfig.c        |  3 ++-
 arch/powerpc/platforms/pseries/scanlog.c         |  1 +
 arch/s390/crypto/prng.c                          |  1 +
 arch/s390/hypfs/hypfs_diag.c                     |  1 +
 arch/s390/hypfs/hypfs_vm.c                       |  1 +
 arch/s390/hypfs/inode.c                          |  1 +
 arch/s390/kernel/debug.c                         |  1 +
 arch/sh/boards/mach-landisk/gio.c                |  1 +
 arch/sparc/kernel/apc.c                          |  1 +
 arch/sparc/kernel/mdesc.c                        |  1 +
 arch/tile/kernel/hardwall.c                      |  1 +
 arch/um/drivers/harddog_kern.c                   |  1 +
 arch/um/drivers/mconsole_kern.c                  |  1 +
 arch/um/drivers/mmapper_kern.c                   |  1 +
 arch/um/drivers/random.c                         |  1 +
 arch/x86/kernel/apm_32.c                         |  1 +
 arch/x86/kernel/cpu/mcheck/mce-severity.c        |  1 +
 arch/x86/kernel/cpu/mcheck/mce.c                 |  1 +
 arch/x86/kernel/kdebugfs.c                       |  1 +
 arch/x86/kernel/microcode_core.c                 |  1 +
 arch/x86/kernel/tlb_uv.c                         |  1 +
 arch/x86/xen/debugfs.c                           |  1 +
 block/bsg.c                                      |  1 +
 drivers/acpi/apei/erst-dbg.c                     |  1 +
 drivers/acpi/debugfs.c                           |  1 +
 drivers/acpi/ec_sys.c                            |  1 +
 drivers/acpi/event.c                             |  1 +
 drivers/block/DAC960.c                           |  3 ++-
 drivers/block/aoe/aoechr.c                       |  1 +
 drivers/block/paride/pg.c                        |  1 +
 drivers/block/paride/pt.c                        |  1 +
 drivers/block/pktcdvd.c                          |  1 +
 drivers/bluetooth/btmrvl_debugfs.c               | 10 +++++++
 drivers/bluetooth/hci_vhci.c                     |  1 +
 drivers/char/apm-emulation.c                     |  1 +
 drivers/char/bfin-otp.c                          |  1 +
 drivers/char/briq_panel.c                        |  1 +
 drivers/char/bsr.c                               |  1 +
 drivers/char/cs5535_gpio.c                       |  3 ++-
 drivers/char/ds1302.c                            |  1 +
 drivers/char/ds1620.c                            |  1 +
 drivers/char/dsp56k.c                            |  1 +
 drivers/char/dtlk.c                              |  1 +
 drivers/char/genrtc.c                            |  1 +
 drivers/char/hw_random/core.c                    |  1 +
 drivers/char/ip2/ip2main.c                       |  1 +
 drivers/char/ipmi/ipmi_devintf.c                 |  1 +
 drivers/char/ipmi/ipmi_watchdog.c                |  1 +
 drivers/char/istallion.c                         |  1 +
 drivers/char/lp.c                                |  1 +
 drivers/char/mem.c                               |  3 +++
 drivers/char/misc.c                              |  1 +
 drivers/char/mmtimer.c                           |  1 +
 drivers/char/mspec.c                             |  9 ++++---
 drivers/char/mwave/mwavedd.c                     |  3 ++-
 drivers/char/nwbutton.c                          |  1 +
 drivers/char/pc8736x_gpio.c                      |  1 +
 drivers/char/pcmcia/cm4000_cs.c                  |  1 +
 drivers/char/pcmcia/cm4040_cs.c                  |  1 +
 drivers/char/random.c                            |  2 ++
 drivers/char/rio/rio_linux.c                     |  1 +
 drivers/char/scx200_gpio.c                       |  1 +
 drivers/char/snsc.c                              |  1 +
 drivers/char/stallion.c                          |  1 +
 drivers/char/sx.c                                |  1 +
 drivers/char/sysrq.c                             |  1 +
 drivers/char/tb0219.c                            |  1 +
 drivers/char/tlclk.c                             |  1 +
 drivers/char/toshiba.c                           |  1 +
 drivers/char/uv_mmtimer.c                        |  1 +
 drivers/char/xilinx_hwicap/xilinx_hwicap.c       |  1 +
 drivers/dma/coh901318.c                          |  1 +
 drivers/firewire/nosy.c                          |  1 +
 drivers/gpu/drm/drm_drv.c                        |  3 ++-
 drivers/gpu/drm/i810/i810_dma.c                  |  1 +
 drivers/gpu/drm/i830/i830_dma.c                  |  1 +
 drivers/gpu/drm/i915/i915_debugfs.c              |  1 +
 drivers/gpu/vga/vgaarb.c                         |  1 +
 drivers/hid/hid-debug.c                          |  1 +
 drivers/hid/hid-roccat.c                         |  1 +
 drivers/hid/hidraw.c                             |  1 +
 drivers/hid/usbhid/hiddev.c                      |  1 +
 drivers/hwmon/asus_atk0110.c                     |  1 +
 drivers/ide/ide-tape.c                           |  1 +
 drivers/idle/i7300_idle.c                        |  1 +
 drivers/infiniband/hw/ipath/ipath_diag.c         |  4 ++-
 drivers/infiniband/hw/ipath/ipath_file_ops.c     |  3 ++-
 drivers/infiniband/hw/ipath/ipath_fs.c           |  3 +++
 drivers/infiniband/hw/qib/qib_diag.c             |  4 ++-
 drivers/infiniband/hw/qib/qib_file_ops.c         |  3 ++-
 drivers/infiniband/hw/qib/qib_fs.c               |  1 +
 drivers/input/evdev.c                            |  3 ++-
 drivers/input/input.c                            |  1 +
 drivers/input/joydev.c                           |  1 +
 drivers/input/misc/uinput.c                      |  1 +
 drivers/input/mousedev.c                         |  1 +
 drivers/input/serio/serio_raw.c                  |  1 +
 drivers/isdn/mISDN/timerdev.c                    |  1 +
 drivers/lguest/lguest_user.c                     |  1 +
 drivers/macintosh/ans-lcd.c                      |  1 +
 drivers/macintosh/via-pmu.c                      |  1 +
 drivers/md/dm-ioctl.c                            |  1 +
 drivers/media/IR/imon.c                          |  6 +++--
 drivers/media/IR/lirc_dev.c                      |  1 +
 drivers/media/dvb/bt8xx/dst_ca.c                 |  3 ++-
 drivers/media/dvb/dvb-core/dmxdev.c              |  2 ++
 drivers/media/dvb/dvb-core/dvb_ca_en50221.c      |  1 +
 drivers/media/dvb/dvb-core/dvb_frontend.c        |  3 ++-
 drivers/media/dvb/dvb-core/dvb_net.c             |  1 +
 drivers/media/dvb/dvb-core/dvbdev.c              |  1 +
 drivers/media/dvb/firewire/firedtv-ci.c          |  1 +
 drivers/media/dvb/ttpci/av7110.c                 |  1 +
 drivers/media/dvb/ttpci/av7110_av.c              |  2 ++
 drivers/media/dvb/ttpci/av7110_ca.c              |  1 +
 drivers/media/dvb/ttpci/av7110_ir.c              |  1 +
 drivers/mfd/ab3100-core.c                        |  1 +
 drivers/misc/hpilo.c                             |  1 +
 drivers/misc/phantom.c                           |  1 +
 drivers/misc/sgi-gru/grufile.c                   |  1 +
 drivers/mmc/core/debugfs.c                       |  1 +
 drivers/mtd/ubi/cdev.c                           |  1 +
 drivers/net/caif/caif_spi.c                      |  6 +++--
 drivers/net/cxgb4/cxgb4_main.c                   |  1 +
 drivers/net/ppp_generic.c                        |  3 ++-
 drivers/net/wimax/i2400m/debugfs.c               |  2 ++
 drivers/net/wireless/airo.c                      | 24 +++++++++++------
 drivers/net/wireless/ath/ath5k/debug.c           |  7 +++++
 drivers/net/wireless/ath/ath9k/debug.c           | 33 ++++++++++++++++--------
 drivers/net/wireless/ath/ath9k/htc_drv_main.c    |  9 ++++---
 drivers/net/wireless/iwlwifi/iwl-3945-rs.c       |  1 +
 drivers/net/wireless/iwlwifi/iwl-agn-rs.c        |  3 +++
 drivers/net/wireless/iwmc3200wifi/debugfs.c      |  4 +++
 drivers/net/wireless/iwmc3200wifi/sdio.c         |  1 +
 drivers/net/wireless/libertas/debugfs.c          |  1 +
 drivers/net/wireless/ray_cs.c                    |  2 ++
 drivers/net/wireless/rt2x00/rt2x00debug.c        |  4 +++
 drivers/net/wireless/wl12xx/wl1251_debugfs.c     |  2 ++
 drivers/net/wireless/wl12xx/wl1271_debugfs.c     |  4 ++-
 drivers/oprofile/oprofile_files.c                |  8 +++++-
 drivers/oprofile/oprofilefs.c                    |  3 +++
 drivers/pci/pcie/aer/aer_inject.c                |  1 +
 drivers/platform/x86/sony-laptop.c               |  1 +
 drivers/rtc/rtc-m41t80.c                         |  1 +
 drivers/s390/block/dasd_eer.c                    |  1 +
 drivers/s390/char/fs3270.c                       |  1 +
 drivers/s390/char/monreader.c                    |  1 +
 drivers/s390/char/monwriter.c                    |  1 +
 drivers/s390/char/tape_char.c                    |  1 +
 drivers/s390/char/vmcp.c                         |  1 +
 drivers/s390/char/vmlogrdr.c                     |  1 +
 drivers/s390/char/vmwatchdog.c                   |  1 +
 drivers/s390/char/zcore.c                        |  2 ++
 drivers/s390/cio/chsc_sch.c                      |  1 +
 drivers/s390/cio/css.c                           |  1 +
 drivers/s390/crypto/zcrypt_api.c                 |  3 ++-
 drivers/s390/scsi/zfcp_cfdc.c                    |  3 ++-
 drivers/sbus/char/display7seg.c                  |  1 +
 drivers/sbus/char/envctrl.c                      |  1 +
 drivers/scsi/3w-9xxx.c                           |  3 ++-
 drivers/scsi/3w-sas.c                            |  3 ++-
 drivers/scsi/3w-xxxx.c                           |  3 ++-
 drivers/scsi/aacraid/linit.c                     |  1 +
 drivers/scsi/ch.c                                |  1 +
 drivers/scsi/dpt_i2o.c                           |  1 +
 drivers/scsi/gdth.c                              |  1 +
 drivers/scsi/megaraid.c                          |  1 +
 drivers/scsi/megaraid/megaraid_mm.c              |  1 +
 drivers/scsi/megaraid/megaraid_sas.c             |  1 +
 drivers/scsi/mpt2sas/mpt2sas_ctl.c               |  1 +
 drivers/scsi/osd/osd_uld.c                       |  1 +
 drivers/scsi/pmcraid.c                           |  1 +
 drivers/scsi/qla2xxx/qla_os.c                    |  1 +
 drivers/scsi/scsi_tgt_if.c                       |  1 +
 drivers/scsi/sg.c                                |  1 +
 drivers/serial/mfd.c                             |  2 ++
 drivers/spi/dw_spi.c                             |  1 +
 drivers/spi/spidev.c                             |  1 +
 drivers/staging/comedi/comedi_fops.c             |  1 +
 drivers/staging/crystalhd/crystalhd_lnx.c        |  1 +
 drivers/staging/dream/camera/msm_camera.c        |  3 +++
 drivers/staging/dream/pmem.c                     |  2 ++
 drivers/staging/dream/qdsp5/adsp_driver.c        |  1 +
 drivers/staging/dream/qdsp5/audio_aac.c          |  1 +
 drivers/staging/dream/qdsp5/audio_amrnb.c        |  1 +
 drivers/staging/dream/qdsp5/audio_evrc.c         |  1 +
 drivers/staging/dream/qdsp5/audio_in.c           |  2 ++
 drivers/staging/dream/qdsp5/audio_mp3.c          |  1 +
 drivers/staging/dream/qdsp5/audio_out.c          |  2 ++
 drivers/staging/dream/qdsp5/audio_qcelp.c        |  1 +
 drivers/staging/dream/qdsp5/evlog.h              |  1 +
 drivers/staging/dream/qdsp5/snd.c                |  1 +
 drivers/staging/frontier/alphatrack.c            |  1 +
 drivers/staging/frontier/tranzport.c             |  1 +
 drivers/staging/iio/industrialio-core.c          |  1 +
 drivers/staging/iio/industrialio-ring.c          |  1 +
 drivers/staging/lirc/lirc_imon.c                 |  3 ++-
 drivers/staging/lirc/lirc_it87.c                 |  1 +
 drivers/staging/lirc/lirc_sasem.c                |  1 +
 drivers/staging/memrar/memrar_handler.c          |  1 +
 drivers/staging/panel/panel.c                    |  1 +
 drivers/staging/tidspbridge/rmgr/drv_interface.c |  1 +
 drivers/telephony/ixj.c                          |  3 ++-
 drivers/telephony/phonedev.c                     |  1 +
 drivers/uio/uio.c                                |  1 +
 drivers/usb/class/cdc-wdm.c                      |  3 ++-
 drivers/usb/class/usblp.c                        |  1 +
 drivers/usb/class/usbtmc.c                       |  1 +
 drivers/usb/core/file.c                          |  1 +
 drivers/usb/gadget/f_hid.c                       |  1 +
 drivers/usb/gadget/printer.c                     |  3 ++-
 drivers/usb/host/ehci-dbg.c                      |  4 +++
 drivers/usb/host/ohci-dbg.c                      |  3 +++
 drivers/usb/image/mdc800.c                       |  1 +
 drivers/usb/misc/adutux.c                        |  1 +
 drivers/usb/misc/idmouse.c                       |  1 +
 drivers/usb/misc/iowarrior.c                     |  1 +
 drivers/usb/misc/ldusb.c                         |  1 +
 drivers/usb/misc/rio500.c                        |  1 +
 drivers/usb/misc/usblcd.c                        |  1 +
 drivers/usb/usb-skeleton.c                       |  1 +
 drivers/vhost/net.c                              |  1 +
 drivers/video/fbmem.c                            |  1 +
 drivers/video/mbx/mbxdebugfs.c                   |  6 +++++
 drivers/watchdog/ar7_wdt.c                       |  1 +
 drivers/watchdog/cpwd.c                          |  1 +
 drivers/watchdog/ep93xx_wdt.c                    |  1 +
 drivers/watchdog/omap_wdt.c                      |  1 +
 drivers/xen/evtchn.c                             |  1 +
 drivers/xen/xenfs/super.c                        |  1 +
 drivers/xen/xenfs/xenbus.c                       |  1 +
 fs/afs/mntpt.c                                   |  1 +
 fs/autofs4/dev-ioctl.c                           |  1 +
 fs/binfmt_misc.c                                 |  3 +++
 fs/btrfs/super.c                                 |  1 +
 fs/cachefiles/daemon.c                           |  1 +
 fs/char_dev.c                                    |  1 +
 fs/coda/pioctl.c                                 |  1 +
 fs/coda/psdev.c                                  |  1 +
 fs/debugfs/file.c                                |  3 +++
 fs/dlm/debug_fs.c                                |  3 ++-
 fs/dlm/plock.c                                   |  3 ++-
 fs/dlm/user.c                                    |  3 +++
 fs/ecryptfs/file.c                               |  1 +
 fs/ecryptfs/miscdev.c                            |  1 +
 fs/eventfd.c                                     |  1 +
 fs/eventpoll.c                                   |  3 ++-
 fs/fifo.c                                        |  1 +
 fs/fuse/control.c                                |  4 +++
 fs/fuse/cuse.c                                   |  1 +
 fs/gfs2/file.c                                   |  2 ++
 fs/hppfs/hppfs.c                                 |  1 +
 fs/hugetlbfs/inode.c                             |  1 +
 fs/logfs/dir.c                                   |  1 +
 fs/nfsd/nfsctl.c                                 |  1 +
 fs/no-block.c                                    |  1 +
 fs/notify/fanotify/fanotify_user.c               |  1 +
 fs/notify/inotify/inotify_user.c                 |  1 +
 fs/ocfs2/dlmfs/dlmfs.c                           |  1 +
 fs/ocfs2/stack_user.c                            |  1 +
 fs/proc/base.c                                   |  8 ++++++
 fs/proc/proc_sysctl.c                            |  1 +
 fs/proc/root.c                                   |  1 +
 fs/proc/task_mmu.c                               |  1 +
 fs/romfs/super.c                                 |  1 +
 fs/signalfd.c                                    |  1 +
 fs/squashfs/dir.c                                |  3 ++-
 fs/timerfd.c                                     |  1 +
 fs/ubifs/debug.c                                 |  1 +
 ipc/mqueue.c                                     |  1 +
 ipc/shm.c                                        |  2 ++
 kernel/configs.c                                 |  1 +
 kernel/gcov/fs.c                                 |  1 +
 kernel/kprobes.c                                 |  1 +
 kernel/pm_qos_params.c                           |  1 +
 kernel/profile.c                                 |  1 +
 kernel/trace/blktrace.c                          |  2 ++
 kernel/trace/ftrace.c                            |  2 ++
 kernel/trace/ring_buffer.c                       |  1 +
 kernel/trace/trace_events.c                      |  6 +++++
 kernel/trace/trace_stack.c                       |  1 +
 lib/dma-debug.c                                  |  1 +
 net/atm/proc.c                                   |  1 +
 net/dccp/probe.c                                 |  1 +
 net/ipv4/tcp_probe.c                             |  1 +
 net/mac80211/debugfs.c                           | 19 +++++++++-----
 net/mac80211/rate.c                              |  1 +
 net/mac80211/rc80211_minstrel_debugfs.c          |  1 +
 net/mac80211/rc80211_pid_debugfs.c               |  1 +
 net/netfilter/xt_recent.c                        |  1 +
 net/nonet.c                                      |  1 +
 net/rfkill/core.c                                |  1 +
 net/sctp/probe.c                                 |  1 +
 net/socket.c                                     |  1 +
 net/sunrpc/cache.c                               |  2 ++
 net/wireless/debugfs.c                           |  1 +
 samples/kfifo/bytestream-example.c               |  1 +
 samples/kfifo/inttype-example.c                  |  1 +
 samples/kfifo/record-example.c                   |  1 +
 samples/tracepoints/tracepoint-sample.c          |  1 +
 security/apparmor/apparmorfs.c                   |  9 ++++---
 security/inode.c                                 |  1 +
 security/smack/smackfs.c                         |  5 ++++
 sound/core/seq/oss/seq_oss.c                     |  1 +
 sound/core/sound.c                               |  3 ++-
 sound/oss/msnd_pinnacle.c                        |  1 +
 sound/soc/soc-core.c                             |  1 +
 sound/soc/soc-dapm.c                             |  1 +
 sound/sound_core.c                               |  1 +
 virt/kvm/kvm_main.c                              |  3 +++
 339 files changed, 538 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/arch/arm/kernel/etm.c b/arch/arm/kernel/etm.c
index 33c7077174db..30b8878f1202 100644
--- a/arch/arm/kernel/etm.c
+++ b/arch/arm/kernel/etm.c
@@ -314,6 +314,7 @@ static const struct file_operations etb_fops = {
 	.read = etb_read,
 	.open = etb_open,
 	.release = etb_release,
+	.llseek = no_llseek,
 };
 
 static struct miscdevice etb_miscdev = {
diff --git a/arch/arm/mach-msm/last_radio_log.c b/arch/arm/mach-msm/last_radio_log.c
index b64ba5a98686..1e243f46a969 100644
--- a/arch/arm/mach-msm/last_radio_log.c
+++ b/arch/arm/mach-msm/last_radio_log.c
@@ -48,7 +48,8 @@ static ssize_t last_radio_log_read(struct file *file, char __user *buf,
 }
 
 static struct file_operations last_radio_log_fops = {
-	.read = last_radio_log_read
+	.read = last_radio_log_read,
+	.llseek = default_llseek,
 };
 
 void msm_init_last_radio_log(struct module *owner)
diff --git a/arch/arm/mach-msm/smd_debug.c b/arch/arm/mach-msm/smd_debug.c
index 3b2dd717b788..f91c3b7bc655 100644
--- a/arch/arm/mach-msm/smd_debug.c
+++ b/arch/arm/mach-msm/smd_debug.c
@@ -212,6 +212,7 @@ static int debug_open(struct inode *inode, struct file *file)
 static const struct file_operations debug_ops = {
 	.read = debug_read,
 	.open = debug_open,
+	.llseek = default_llseek,
 };
 
 static void debug_create(const char *name, mode_t mode,
diff --git a/arch/arm/plat-mxc/audmux-v2.c b/arch/arm/plat-mxc/audmux-v2.c
index f9e7cdbd0005..25ad95ba92a4 100644
--- a/arch/arm/plat-mxc/audmux-v2.c
+++ b/arch/arm/plat-mxc/audmux-v2.c
@@ -137,6 +137,7 @@ static ssize_t audmux_read_file(struct file *file, char __user *user_buf,
 static const struct file_operations audmux_debugfs_fops = {
 	.open = audmux_open_file,
 	.read = audmux_read_file,
+	.llseek = default_llseek,
 };
 
 static void audmux_debugfs_init(void)
diff --git a/arch/avr32/boards/mimc200/fram.c b/arch/avr32/boards/mimc200/fram.c
index 54fbd95cee9b..9764a1a1073e 100644
--- a/arch/avr32/boards/mimc200/fram.c
+++ b/arch/avr32/boards/mimc200/fram.c
@@ -41,6 +41,7 @@ static int fram_mmap(struct file *filp, struct vm_area_struct *vma)
 static const struct file_operations fram_fops = {
 	.owner			= THIS_MODULE,
 	.mmap			= fram_mmap,
+	.llseek			= noop_llseek,
 };
 
 #define FRAM_MINOR	0
diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c
index 9a4b07594389..08c0236acf3c 100644
--- a/arch/blackfin/kernel/kgdb_test.c
+++ b/arch/blackfin/kernel/kgdb_test.c
@@ -88,6 +88,7 @@ static const struct file_operations kgdb_test_proc_fops = {
 	.owner = THIS_MODULE,
 	.read  = kgdb_test_proc_read,
 	.write = kgdb_test_proc_write,
+	.llseek = noop_llseek,
 };
 
 static int __init kgdbtest_init(void)
diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c
index deb2271d09a3..c6a4c8f2d37b 100644
--- a/arch/blackfin/mach-bf561/coreb.c
+++ b/arch/blackfin/mach-bf561/coreb.c
@@ -51,6 +51,7 @@ coreb_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 static const struct file_operations coreb_fops = {
 	.owner          = THIS_MODULE,
 	.unlocked_ioctl = coreb_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice coreb_dev = {
diff --git a/arch/cris/arch-v10/drivers/ds1302.c b/arch/cris/arch-v10/drivers/ds1302.c
index 884275629ef7..95aadb4a8cf1 100644
--- a/arch/cris/arch-v10/drivers/ds1302.c
+++ b/arch/cris/arch-v10/drivers/ds1302.c
@@ -387,6 +387,7 @@ print_rtc_status(void)
 static const struct file_operations rtc_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl = rtc_unlocked_ioctl,
+	.llseek		= noop_llseek,
 }; 
 
 /* Probe for the chip by writing something to its RAM and try reading it back. */
diff --git a/arch/cris/arch-v10/drivers/gpio.c b/arch/cris/arch-v10/drivers/gpio.c
index a07b6d25b0c7..a276f0811731 100644
--- a/arch/cris/arch-v10/drivers/gpio.c
+++ b/arch/cris/arch-v10/drivers/gpio.c
@@ -745,6 +745,7 @@ static const struct file_operations gpio_fops = {
 	.write          = gpio_write,
 	.open           = gpio_open,
 	.release        = gpio_release,
+	.llseek		= noop_llseek,
 };
 
 static void ioif_watcher(const unsigned int gpio_in_available,
diff --git a/arch/cris/arch-v10/drivers/i2c.c b/arch/cris/arch-v10/drivers/i2c.c
index 77a941813819..c413539d4205 100644
--- a/arch/cris/arch-v10/drivers/i2c.c
+++ b/arch/cris/arch-v10/drivers/i2c.c
@@ -617,6 +617,7 @@ static const struct file_operations i2c_fops = {
 	.unlocked_ioctl	= i2c_ioctl,
 	.open		= i2c_open,
 	.release	= i2c_release,
+	.llseek		= noop_llseek,
 };
 
 int __init
diff --git a/arch/cris/arch-v10/drivers/pcf8563.c b/arch/cris/arch-v10/drivers/pcf8563.c
index 7dcb1f85f42b..7aa6ff00a117 100644
--- a/arch/cris/arch-v10/drivers/pcf8563.c
+++ b/arch/cris/arch-v10/drivers/pcf8563.c
@@ -64,6 +64,7 @@ static int voltage_low;
 static const struct file_operations pcf8563_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = pcf8563_unlocked_ioctl,
+	.llseek		= noop_llseek,
 };
 
 unsigned char
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index ee2dd4323daf..e501632fa8c2 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -250,7 +250,8 @@ static const struct file_operations sync_serial_fops = {
 	.poll		= sync_serial_poll,
 	.unlocked_ioctl	= sync_serial_ioctl,
 	.open		= sync_serial_open,
-	.release	= sync_serial_release
+	.release	= sync_serial_release,
+	.llseek		= noop_llseek,
 };
 
 static int __init etrax_sync_serial_init(void)
diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index b07646a30509..dcb43fddfb99 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -281,7 +281,8 @@ const struct file_operations cryptocop_fops = {
 	.owner		= THIS_MODULE,
 	.open		= cryptocop_open,
 	.release	= cryptocop_release,
-	.unlocked_ioctl = cryptocop_ioctl
+	.unlocked_ioctl = cryptocop_ioctl,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/arch/cris/arch-v32/drivers/i2c.c b/arch/cris/arch-v32/drivers/i2c.c
index 5a3e900c9a78..ddb23996f11a 100644
--- a/arch/cris/arch-v32/drivers/i2c.c
+++ b/arch/cris/arch-v32/drivers/i2c.c
@@ -698,6 +698,7 @@ static const struct file_operations i2c_fops = {
 	.unlocked_ioctl = i2c_ioctl,
 	.open		= i2c_open,
 	.release	= i2c_release,
+	.llseek		= noop_llseek,
 };
 
 static int __init i2c_init(void)
diff --git a/arch/cris/arch-v32/drivers/mach-a3/gpio.c b/arch/cris/arch-v32/drivers/mach-a3/gpio.c
index 2dcd27adbad4..06b24ebda410 100644
--- a/arch/cris/arch-v32/drivers/mach-a3/gpio.c
+++ b/arch/cris/arch-v32/drivers/mach-a3/gpio.c
@@ -893,6 +893,7 @@ static const struct file_operations gpio_fops = {
 	.write		= gpio_write,
 	.open		= gpio_open,
 	.release	= gpio_release,
+	.llseek		= noop_llseek,
 };
 
 #ifdef CONFIG_ETRAX_VIRTUAL_GPIO
diff --git a/arch/cris/arch-v32/drivers/mach-fs/gpio.c b/arch/cris/arch-v32/drivers/mach-fs/gpio.c
index 5ec8a7d4e7d7..0649c8bea676 100644
--- a/arch/cris/arch-v32/drivers/mach-fs/gpio.c
+++ b/arch/cris/arch-v32/drivers/mach-fs/gpio.c
@@ -870,6 +870,7 @@ static const struct file_operations gpio_fops = {
 	.write		= gpio_write,
 	.open		= gpio_open,
 	.release	= gpio_release,
+	.llseek		= noop_llseek,
 };
 
 #ifdef CONFIG_ETRAX_VIRTUAL_GPIO
diff --git a/arch/cris/arch-v32/drivers/pcf8563.c b/arch/cris/arch-v32/drivers/pcf8563.c
index bef6eb53b153..0f7b101ee5ee 100644
--- a/arch/cris/arch-v32/drivers/pcf8563.c
+++ b/arch/cris/arch-v32/drivers/pcf8563.c
@@ -60,6 +60,7 @@ static int voltage_low;
 static const struct file_operations pcf8563_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl = pcf8563_unlocked_ioctl,
+	.llseek		= noop_llseek,
 };
 
 unsigned char
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c
index ca248f3adb80..a2e8a8c39856 100644
--- a/arch/cris/arch-v32/drivers/sync_serial.c
+++ b/arch/cris/arch-v32/drivers/sync_serial.c
@@ -247,7 +247,8 @@ static const struct file_operations sync_serial_fops = {
 	.poll		= sync_serial_poll,
 	.unlocked_ioctl	= sync_serial_ioctl,
 	.open		= sync_serial_open,
-	.release	= sync_serial_release
+	.release	= sync_serial_release,
+	.llseek		= noop_llseek,
 };
 
 static int __init etrax_sync_serial_init(void)
diff --git a/arch/cris/kernel/profile.c b/arch/cris/kernel/profile.c
index 195ec5fa0dd2..b82e08615d1b 100644
--- a/arch/cris/kernel/profile.c
+++ b/arch/cris/kernel/profile.c
@@ -59,6 +59,7 @@ write_cris_profile(struct file *file, const char __user *buf,
 static const struct file_operations cris_proc_profile_operations = {
 	.read		= read_cris_profile,
 	.write		= write_cris_profile,
+	.llseek		= default_llseek,
 };
 
 static int __init init_cris_profile(void)
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index aa8b5fa1a8de..6f22c4041630 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -354,6 +354,7 @@ retry:
 static const struct file_operations salinfo_event_fops = {
 	.open  = salinfo_event_open,
 	.read  = salinfo_event_read,
+	.llseek = noop_llseek,
 };
 
 static int
@@ -571,6 +572,7 @@ static const struct file_operations salinfo_data_fops = {
 	.release = salinfo_log_release,
 	.read    = salinfo_log_read,
 	.write   = salinfo_log_write,
+	.llseek  = default_llseek,
 };
 
 static int __cpuinit
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index fa1eceed0d23..30862c0358cd 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -860,6 +860,7 @@ error:
 
 static const struct file_operations sn_hwperf_fops = {
 	.unlocked_ioctl = sn_hwperf_ioctl,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice sn_hwperf_dev = {
diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c
index cb8617bb194b..1c4d4c7bf4d4 100644
--- a/arch/m68k/bvme6000/rtc.c
+++ b/arch/m68k/bvme6000/rtc.c
@@ -155,6 +155,7 @@ static const struct file_operations rtc_fops = {
 	.unlocked_ioctl	= rtc_ioctl,
 	.open		= rtc_open,
 	.release	= rtc_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice rtc_dev = {
diff --git a/arch/m68k/mvme16x/rtc.c b/arch/m68k/mvme16x/rtc.c
index 11ac6f63967a..39c79ebcd18a 100644
--- a/arch/m68k/mvme16x/rtc.c
+++ b/arch/m68k/mvme16x/rtc.c
@@ -144,6 +144,7 @@ static const struct file_operations rtc_fops = {
 	.unlocked_ioctl	= rtc_ioctl,
 	.open		= rtc_open,
 	.release	= rtc_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice rtc_dev=
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c
index 26f9b9ab19cc..557ef72472e0 100644
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -468,7 +468,8 @@ static const struct file_operations rtlx_fops = {
 	.release = file_release,
 	.write =   file_write,
 	.read =    file_read,
-	.poll =    file_poll
+	.poll =    file_poll,
+	.llseek =  noop_llseek,
 };
 
 static struct irqaction rtlx_irq = {
diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 2bd2151c586a..3eb3cde2f661 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -1192,7 +1192,8 @@ static const struct file_operations vpe_fops = {
 	.owner = THIS_MODULE,
 	.open = vpe_open,
 	.release = vpe_release,
-	.write = vpe_write
+	.write = vpe_write,
+	.llseek = noop_llseek,
 };
 
 /* module wrapper entry points */
diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c
index d4ed7a9156f5..ca35b730d189 100644
--- a/arch/mips/sibyte/common/sb_tbprof.c
+++ b/arch/mips/sibyte/common/sb_tbprof.c
@@ -545,6 +545,7 @@ static const struct file_operations sbprof_tb_fops = {
 	.unlocked_ioctl	= sbprof_tb_ioctl,
 	.compat_ioctl	= sbprof_tb_ioctl,
 	.mmap		= NULL,
+	.llseek		= default_llseek,
 };
 
 static struct class *tb_class;
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 50362b6ef6e9..b1dd962e247e 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -780,6 +780,7 @@ static const struct file_operations lparcfg_fops = {
 	.write		= lparcfg_write,
 	.open		= lparcfg_open,
 	.release	= single_release,
+	.llseek		= seq_lseek,
 };
 
 static int __init lparcfg_init(void)
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 67a84d8f118d..2b442e6c21e6 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -716,6 +716,7 @@ static const struct file_operations rtas_flash_operations = {
 	.write		= rtas_flash_write,
 	.open		= rtas_excl_open,
 	.release	= rtas_flash_release,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations manage_flash_operations = {
@@ -724,6 +725,7 @@ static const struct file_operations manage_flash_operations = {
 	.write		= manage_flash_write,
 	.open		= rtas_excl_open,
 	.release	= rtas_excl_release,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations validate_flash_operations = {
@@ -732,6 +734,7 @@ static const struct file_operations validate_flash_operations = {
 	.write		= validate_flash_write,
 	.open		= rtas_excl_open,
 	.release	= validate_flash_release,
+	.llseek		= default_llseek,
 };
 
 static int __init rtas_flash_init(void)
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 638883e23e3a..0438f819fe6b 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -354,6 +354,7 @@ static const struct file_operations proc_rtas_log_operations = {
 	.poll =		rtas_log_poll,
 	.open =		rtas_log_open,
 	.release =	rtas_log_release,
+	.llseek =	noop_llseek,
 };
 
 static int enable_surveillance(int timeout)
diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c
index 33e5fc7334fc..42d0a886de05 100644
--- a/arch/powerpc/platforms/iseries/mf.c
+++ b/arch/powerpc/platforms/iseries/mf.c
@@ -1249,6 +1249,7 @@ out:
 
 static const struct file_operations proc_vmlinux_operations = {
 	.write		= proc_mf_change_vmlinux,
+	.llseek		= default_llseek,
 };
 
 static int __init mf_proc_init(void)
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 57ddbb43b33a..1de2cbb92303 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -539,7 +539,8 @@ out:
 }
 
 static const struct file_operations ofdt_fops = {
-	.write = ofdt_write
+	.write = ofdt_write,
+	.llseek = noop_llseek,
 };
 
 /* create /proc/powerpc/ofdt write-only by root */
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
index 80e9e7652a4d..554457294a2b 100644
--- a/arch/powerpc/platforms/pseries/scanlog.c
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -170,6 +170,7 @@ const struct file_operations scanlog_fops = {
 	.write		= scanlog_write,
 	.open		= scanlog_open,
 	.release	= scanlog_release,
+	.llseek		= noop_llseek,
 };
 
 static int __init scanlog_init(void)
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index aa819dac2360..975e3ab13cb5 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -152,6 +152,7 @@ static const struct file_operations prng_fops = {
 	.open		= &prng_open,
 	.release	= NULL,
 	.read		= &prng_read,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice prng_dev = {
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 1211bb1d2f24..020e51c063d2 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -618,6 +618,7 @@ static const struct file_operations dbfs_d204_ops = {
 	.open		= dbfs_d204_open,
 	.read		= dbfs_d204_read,
 	.release	= dbfs_d204_release,
+	.llseek		= no_llseek,
 };
 
 static int hypfs_dbfs_init(void)
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index ee5ab1a578e7..26cf177f6a3a 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -275,6 +275,7 @@ static const struct file_operations dbfs_d2fc_ops = {
 	.open		= dbfs_d2fc_open,
 	.read		= dbfs_d2fc_read,
 	.release	= dbfs_d2fc_release,
+	.llseek		= no_llseek,
 };
 
 int hypfs_vm_init(void)
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 98a4a4c267a7..74d98670be27 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -449,6 +449,7 @@ static const struct file_operations hypfs_file_ops = {
 	.write		= do_sync_write,
 	.aio_read	= hypfs_aio_read,
 	.aio_write	= hypfs_aio_write,
+	.llseek		= no_llseek,
 };
 
 static struct file_system_type hypfs_type = {
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 98192261491d..5ad6bc078bfd 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -174,6 +174,7 @@ static const struct file_operations debug_file_ops = {
 	.write   = debug_input,
 	.open    = debug_open,
 	.release = debug_close,
+	.llseek  = no_llseek,
 };
 
 static struct dentry *debug_debugfs_root_entry;
diff --git a/arch/sh/boards/mach-landisk/gio.c b/arch/sh/boards/mach-landisk/gio.c
index 01e6abb769b9..8132dff078fb 100644
--- a/arch/sh/boards/mach-landisk/gio.c
+++ b/arch/sh/boards/mach-landisk/gio.c
@@ -128,6 +128,7 @@ static const struct file_operations gio_fops = {
 	.open = gio_open,	/* open */
 	.release = gio_close,	/* release */
 	.unlocked_ioctl = gio_ioctl,
+	.llseek = noop_llseek,
 };
 
 static int __init gio_init(void)
diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c
index 2c0046ecc715..52de4a9424e8 100644
--- a/arch/sparc/kernel/apc.c
+++ b/arch/sparc/kernel/apc.c
@@ -132,6 +132,7 @@ static const struct file_operations apc_fops = {
 	.unlocked_ioctl =	apc_ioctl,
 	.open =			apc_open,
 	.release =		apc_release,
+	.llseek =		noop_llseek,
 };
 
 static struct miscdevice apc_miscdev = { APC_MINOR, APC_DEVNAME, &apc_fops };
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 83e85c2e802a..6addb914fcc8 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -890,6 +890,7 @@ static ssize_t mdesc_read(struct file *file, char __user *buf,
 static const struct file_operations mdesc_fops = {
 	.read	= mdesc_read,
 	.owner	= THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice mdesc_misc = {
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 584b965dc824..1e54a7843410 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -774,6 +774,7 @@ static const struct file_operations dev_hardwall_fops = {
 #endif
 	.flush          = hardwall_flush,
 	.release        = hardwall_release,
+	.llseek		= noop_llseek,
 };
 
 static struct cdev hardwall_dev;
diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c
index cfcac1ff4cf2..dd1e6f871fe4 100644
--- a/arch/um/drivers/harddog_kern.c
+++ b/arch/um/drivers/harddog_kern.c
@@ -166,6 +166,7 @@ static const struct file_operations harddog_fops = {
 	.unlocked_ioctl	= harddog_ioctl,
 	.open		= harddog_open,
 	.release	= harddog_release,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice harddog_miscdev = {
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index ebc680717e59..975613b23dcf 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -843,6 +843,7 @@ static ssize_t mconsole_proc_write(struct file *file,
 static const struct file_operations mconsole_proc_fops = {
 	.owner		= THIS_MODULE,
 	.write		= mconsole_proc_write,
+	.llseek		= noop_llseek,
 };
 
 static int create_proc_mconsole(void)
diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c
index 7158393b6793..8501e7d0015c 100644
--- a/arch/um/drivers/mmapper_kern.c
+++ b/arch/um/drivers/mmapper_kern.c
@@ -93,6 +93,7 @@ static const struct file_operations mmapper_fops = {
 	.mmap		= mmapper_mmap,
 	.open		= mmapper_open,
 	.release	= mmapper_release,
+	.llseek		= default_llseek,
 };
 
 /*
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 4949044773ba..981085a93f30 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -100,6 +100,7 @@ static const struct file_operations rng_chrdev_ops = {
 	.owner		= THIS_MODULE,
 	.open		= rng_dev_open,
 	.read		= rng_dev_read,
+	.llseek		= noop_llseek,
 };
 
 /* rng_init shouldn't be called more than once at boot time */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b7..fbbc4dadecc4 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = {
 	.unlocked_ioctl	= do_ioctl,
 	.open		= do_open,
 	.release	= do_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice apm_device = {
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
 	.release	= seq_release,
 	.read		= seq_read,
 	.write		= severities_coverage_write,
+	.llseek		= seq_lseek,
 };
 
 static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909fe..7a35b72d7c03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = {
 	.read			= mce_read,
 	.poll			= mce_poll,
 	.unlocked_ioctl		= mce_ioctl,
+	.llseek		= no_llseek,
 };
 EXPORT_SYMBOL_GPL(mce_chrdev_ops);
 
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
 static const struct file_operations fops_setup_data = {
 	.read		= setup_data_read,
 	.open		= setup_data_open,
+	.llseek		= default_llseek,
 };
 
 static int __init
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..0b3d37e83606 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = {
 	.owner			= THIS_MODULE,
 	.write			= microcode_write,
 	.open			= microcode_open,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice microcode_dev = {
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 312ef0292815..50ac949c7f1c 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1285,6 +1285,7 @@ static const struct file_operations tunables_fops = {
 	.open		= tunables_open,
 	.read		= tunables_read,
 	.write		= tunables_write,
+	.llseek		= default_llseek,
 };
 
 static int __init uv_ptc_init(void)
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee5..7c0fedd98ea0 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
 	.open	= u32_array_open,
 	.release= xen_array_release,
 	.read	= u32_array_read,
+	.llseek = no_llseek,
 };
 
 struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/block/bsg.c b/block/bsg.c
index 82d58829ba59..83e381a26b58 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -968,6 +968,7 @@ static const struct file_operations bsg_fops = {
 	.release	=	bsg_release,
 	.unlocked_ioctl	=	bsg_ioctl,
 	.owner		=	THIS_MODULE,
+	.llseek		=	default_llseek,
 };
 
 void bsg_unregister_queue(struct request_queue *q)
diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c
index 5281ddda2777..cbab9b07bf28 100644
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -180,6 +180,7 @@ static const struct file_operations erst_dbg_ops = {
 	.read		= erst_dbg_read,
 	.write		= erst_dbg_write,
 	.unlocked_ioctl	= erst_dbg_ioctl,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice erst_dbg_dev = {
diff --git a/drivers/acpi/debugfs.c b/drivers/acpi/debugfs.c
index 7de27d49c4b9..6355b575ee5a 100644
--- a/drivers/acpi/debugfs.c
+++ b/drivers/acpi/debugfs.c
@@ -69,6 +69,7 @@ static ssize_t cm_write(struct file *file, const char __user * user_buf,
 
 static const struct file_operations cm_fops = {
 	.write = cm_write,
+	.llseek = default_llseek,
 };
 
 int __init acpi_debugfs_init(void)
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index 0e869b3f81ca..411620ef84c2 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -101,6 +101,7 @@ static struct file_operations acpi_ec_io_ops = {
 	.open  = acpi_ec_open_io,
 	.read  = acpi_ec_read_io,
 	.write = acpi_ec_write_io,
+	.llseek = default_llseek,
 };
 
 int acpi_ec_add_debugfs(struct acpi_ec *ec, unsigned int ec_device_count)
diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index d439314a75d8..85d908993809 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -110,6 +110,7 @@ static const struct file_operations acpi_system_event_ops = {
 	.read = acpi_system_read_event,
 	.release = acpi_system_close_event,
 	.poll = acpi_system_poll_event,
+	.llseek = default_llseek,
 };
 #endif	/* CONFIG_ACPI_PROC_EVENT */
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 4e2c367fec11..dfcb33e8d405 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -7062,7 +7062,8 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 
 static const struct file_operations DAC960_gam_fops = {
 	.owner		= THIS_MODULE,
-	.unlocked_ioctl	= DAC960_gam_ioctl
+	.unlocked_ioctl	= DAC960_gam_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice DAC960_gam_dev = {
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 4a1b9e7464aa..32b484ba21bd 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -265,6 +265,7 @@ static const struct file_operations aoe_fops = {
 	.open = aoechr_open,
 	.release = aoechr_rel,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static char *aoe_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3ddba9b..aa27cd84f633 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -234,6 +234,7 @@ static const struct file_operations pg_fops = {
 	.write = pg_write,
 	.open = pg_open,
 	.release = pg_release,
+	.llseek = noop_llseek,
 };
 
 static void pg_init_units(void)
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825fdeaab..c372c32e0db3 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -239,6 +239,7 @@ static const struct file_operations pt_fops = {
 	.unlocked_ioctl = pt_ioctl,
 	.open = pt_open,
 	.release = pt_release,
+	.llseek = noop_llseek,
 };
 
 /* sysfs class support */
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b1cbeb59bb76..6a4642dd8283 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -3046,6 +3046,7 @@ static const struct file_operations pkt_ctl_fops = {
 	.compat_ioctl	= pkt_ctl_compat_ioctl,
 #endif
 	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice pkt_misc = {
diff --git a/drivers/bluetooth/btmrvl_debugfs.c b/drivers/bluetooth/btmrvl_debugfs.c
index 54739b08c308..fd6305bf953e 100644
--- a/drivers/bluetooth/btmrvl_debugfs.c
+++ b/drivers/bluetooth/btmrvl_debugfs.c
@@ -92,6 +92,7 @@ static const struct file_operations btmrvl_hscfgcmd_fops = {
 	.read	= btmrvl_hscfgcmd_read,
 	.write	= btmrvl_hscfgcmd_write,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_psmode_write(struct file *file, const char __user *ubuf,
@@ -130,6 +131,7 @@ static const struct file_operations btmrvl_psmode_fops = {
 	.read	= btmrvl_psmode_read,
 	.write	= btmrvl_psmode_write,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_pscmd_write(struct file *file, const char __user *ubuf,
@@ -173,6 +175,7 @@ static const struct file_operations btmrvl_pscmd_fops = {
 	.read = btmrvl_pscmd_read,
 	.write = btmrvl_pscmd_write,
 	.open = btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_gpiogap_write(struct file *file, const char __user *ubuf,
@@ -211,6 +214,7 @@ static const struct file_operations btmrvl_gpiogap_fops = {
 	.read	= btmrvl_gpiogap_read,
 	.write	= btmrvl_gpiogap_write,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_hscmd_write(struct file *file, const char __user *ubuf,
@@ -252,6 +256,7 @@ static const struct file_operations btmrvl_hscmd_fops = {
 	.read	= btmrvl_hscmd_read,
 	.write	= btmrvl_hscmd_write,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_hsmode_write(struct file *file, const char __user *ubuf,
@@ -289,6 +294,7 @@ static const struct file_operations btmrvl_hsmode_fops = {
 	.read	= btmrvl_hsmode_read,
 	.write	= btmrvl_hsmode_write,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_curpsmode_read(struct file *file, char __user *userbuf,
@@ -306,6 +312,7 @@ static ssize_t btmrvl_curpsmode_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_curpsmode_fops = {
 	.read	= btmrvl_curpsmode_read,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_psstate_read(struct file *file, char __user * userbuf,
@@ -323,6 +330,7 @@ static ssize_t btmrvl_psstate_read(struct file *file, char __user * userbuf,
 static const struct file_operations btmrvl_psstate_fops = {
 	.read	= btmrvl_psstate_read,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_hsstate_read(struct file *file, char __user *userbuf,
@@ -340,6 +348,7 @@ static ssize_t btmrvl_hsstate_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_hsstate_fops = {
 	.read	= btmrvl_hsstate_read,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t btmrvl_txdnldready_read(struct file *file, char __user *userbuf,
@@ -358,6 +367,7 @@ static ssize_t btmrvl_txdnldready_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_txdnldready_fops = {
 	.read	= btmrvl_txdnldready_read,
 	.open	= btmrvl_open_generic,
+	.llseek = default_llseek,
 };
 
 void btmrvl_debugfs_init(struct hci_dev *hdev)
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index 3aa7b2a54b6f..67c180c2c1e0 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -282,6 +282,7 @@ static const struct file_operations vhci_fops = {
 	.poll		= vhci_poll,
 	.open		= vhci_open,
 	.release	= vhci_release,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice vhci_miscdev= {
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index 033e1505fca9..5ffa6904ea6b 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -402,6 +402,7 @@ static const struct file_operations apm_bios_fops = {
 	.unlocked_ioctl	= apm_ioctl,
 	.open		= apm_open,
 	.release	= apm_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice apm_device = {
diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c
index 836d4f0a876f..44660f1c4849 100644
--- a/drivers/char/bfin-otp.c
+++ b/drivers/char/bfin-otp.c
@@ -222,6 +222,7 @@ static const struct file_operations bfin_otp_fops = {
 	.unlocked_ioctl = bfin_otp_ioctl,
 	.read           = bfin_otp_read,
 	.write          = bfin_otp_write,
+	.llseek		= default_llseek,
 };
 
 static struct miscdevice bfin_otp_misc_device = {
diff --git a/drivers/char/briq_panel.c b/drivers/char/briq_panel.c
index d5fa113afe37..f6718f05dad4 100644
--- a/drivers/char/briq_panel.c
+++ b/drivers/char/briq_panel.c
@@ -186,6 +186,7 @@ static const struct file_operations briq_panel_fops = {
 	.write		= briq_panel_write,
 	.open		= briq_panel_open,
 	.release	= briq_panel_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice briq_panel_miscdev = {
diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c
index 91917133ae0a..a4a6c2f044b5 100644
--- a/drivers/char/bsr.c
+++ b/drivers/char/bsr.c
@@ -155,6 +155,7 @@ static const struct file_operations bsr_fops = {
 	.owner = THIS_MODULE,
 	.mmap  = bsr_mmap,
 	.open  = bsr_open,
+	.llseek = noop_llseek,
 };
 
 static void bsr_cleanup_devs(void)
diff --git a/drivers/char/cs5535_gpio.c b/drivers/char/cs5535_gpio.c
index 4d830dc482ef..0cf1e5fad9ab 100644
--- a/drivers/char/cs5535_gpio.c
+++ b/drivers/char/cs5535_gpio.c
@@ -169,7 +169,8 @@ static const struct file_operations cs5535_gpio_fops = {
 	.owner	= THIS_MODULE,
 	.write	= cs5535_gpio_write,
 	.read	= cs5535_gpio_read,
-	.open	= cs5535_gpio_open
+	.open	= cs5535_gpio_open,
+	.llseek = no_llseek,
 };
 
 static int __init cs5535_gpio_init(void)
diff --git a/drivers/char/ds1302.c b/drivers/char/ds1302.c
index 170693c93c73..4f7aa364167c 100644
--- a/drivers/char/ds1302.c
+++ b/drivers/char/ds1302.c
@@ -288,6 +288,7 @@ get_rtc_status(char *buf)
 static const struct file_operations rtc_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= rtc_ioctl,
+	.llseek		= noop_llseek,
 };
 
 /* Probe for the chip by writing something to its RAM and try reading it back. */
diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c
index dbee8688f75c..50462b63e51b 100644
--- a/drivers/char/ds1620.c
+++ b/drivers/char/ds1620.c
@@ -357,6 +357,7 @@ static const struct file_operations ds1620_fops = {
 	.open		= ds1620_open,
 	.read		= ds1620_read,
 	.unlocked_ioctl	= ds1620_unlocked_ioctl,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice ds1620_miscdev = {
diff --git a/drivers/char/dsp56k.c b/drivers/char/dsp56k.c
index 8a1b28a10ef0..353be4707d92 100644
--- a/drivers/char/dsp56k.c
+++ b/drivers/char/dsp56k.c
@@ -482,6 +482,7 @@ static const struct file_operations dsp56k_fops = {
 	.unlocked_ioctl	= dsp56k_ioctl,
 	.open		= dsp56k_open,
 	.release	= dsp56k_release,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index e3859d4eaead..007eca3ed77c 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -105,6 +105,7 @@ static const struct file_operations dtlk_fops =
 	.unlocked_ioctl	= dtlk_ioctl,
 	.open		= dtlk_open,
 	.release	= dtlk_release,
+	.llseek		= no_llseek,
 };
 
 /* local prototypes */
diff --git a/drivers/char/genrtc.c b/drivers/char/genrtc.c
index b6c2cc167c11..eaa0e4264e16 100644
--- a/drivers/char/genrtc.c
+++ b/drivers/char/genrtc.c
@@ -497,6 +497,7 @@ static const struct file_operations gen_rtc_fops = {
 	.unlocked_ioctl	= gen_rtc_unlocked_ioctl,
 	.open		= gen_rtc_open,
 	.release	= gen_rtc_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice rtc_gen_dev =
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 3d9c61e5acbf..788da05190cc 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -170,6 +170,7 @@ static const struct file_operations rng_chrdev_ops = {
 	.owner		= THIS_MODULE,
 	.open		= rng_dev_open,
 	.read		= rng_dev_read,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice rng_miscdev = {
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index d4b71e8d0d23..2e49e515d8e7 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -236,6 +236,7 @@ static const struct file_operations ip2_ipl = {
 	.write		= ip2_ipl_write,
 	.unlocked_ioctl	= ip2_ipl_ioctl,
 	.open		= ip2_ipl_open,
+	.llseek		= noop_llseek,
 }; 
 
 static unsigned long irq_counter;
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index d8ec92a38980..c6709d6b0fd0 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -850,6 +850,7 @@ static const struct file_operations ipmi_fops = {
 	.release	= ipmi_release,
 	.fasync		= ipmi_fasync,
 	.poll		= ipmi_poll,
+	.llseek		= noop_llseek,
 };
 
 #define DEVICE_NAME     "ipmidev"
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 654d566ca57c..da041f88f64a 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -909,6 +909,7 @@ static const struct file_operations ipmi_wdog_fops = {
 	.open    = ipmi_open,
 	.release = ipmi_close,
 	.fasync  = ipmi_fasync,
+	.llseek  = no_llseek,
 };
 
 static struct miscdevice ipmi_wdog_miscdev = {
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index be28391adb79..667abd23ad6a 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -704,6 +704,7 @@ static const struct file_operations	stli_fsiomem = {
 	.read		= stli_memread,
 	.write		= stli_memwrite,
 	.unlocked_ioctl	= stli_memioctl,
+	.llseek		= default_llseek,
 };
 
 /*****************************************************************************/
diff --git a/drivers/char/lp.c b/drivers/char/lp.c
index 938a3a273886..d2344117eaf5 100644
--- a/drivers/char/lp.c
+++ b/drivers/char/lp.c
@@ -748,6 +748,7 @@ static const struct file_operations lp_fops = {
 #ifdef CONFIG_PARPORT_1284
 	.read		= lp_read,
 #endif
+	.llseek		= noop_llseek,
 };
 
 /* --- support for console on the line printer ----------------- */
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index a398ecdbd758..8ebd232132fd 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -804,6 +804,7 @@ static const struct file_operations full_fops = {
 static const struct file_operations oldmem_fops = {
 	.read	= read_oldmem,
 	.open	= open_oldmem,
+	.llseek = default_llseek,
 };
 #endif
 
@@ -830,6 +831,7 @@ static ssize_t kmsg_write(struct file *file, const char __user *buf,
 
 static const struct file_operations kmsg_fops = {
 	.write = kmsg_write,
+	.llseek = noop_llseek,
 };
 
 static const struct memdev {
@@ -881,6 +883,7 @@ static int memory_open(struct inode *inode, struct file *filp)
 
 static const struct file_operations memory_fops = {
 	.open = memory_open,
+	.llseek = noop_llseek,
 };
 
 static char *mem_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index abdafd488980..778273c93242 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -162,6 +162,7 @@ static struct class *misc_class;
 static const struct file_operations misc_fops = {
 	.owner		= THIS_MODULE,
 	.open		= misc_open,
+	.llseek		= noop_llseek,
 };
 
 /**
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index ea7c99fa978f..1c4070ced066 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -72,6 +72,7 @@ static const struct file_operations mmtimer_fops = {
 	.owner = THIS_MODULE,
 	.mmap =	mmtimer_mmap,
 	.unlocked_ioctl = mmtimer_ioctl,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index ecb89d798e35..966a95bc974b 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -316,7 +316,8 @@ uncached_mmap(struct file *file, struct vm_area_struct *vma)
 
 static const struct file_operations fetchop_fops = {
 	.owner = THIS_MODULE,
-	.mmap = fetchop_mmap
+	.mmap = fetchop_mmap,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice fetchop_miscdev = {
@@ -327,7 +328,8 @@ static struct miscdevice fetchop_miscdev = {
 
 static const struct file_operations cached_fops = {
 	.owner = THIS_MODULE,
-	.mmap = cached_mmap
+	.mmap = cached_mmap,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice cached_miscdev = {
@@ -338,7 +340,8 @@ static struct miscdevice cached_miscdev = {
 
 static const struct file_operations uncached_fops = {
 	.owner = THIS_MODULE,
-	.mmap = uncached_mmap
+	.mmap = uncached_mmap,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice uncached_miscdev = {
diff --git a/drivers/char/mwave/mwavedd.c b/drivers/char/mwave/mwavedd.c
index a4ec50c95072..0464822eac53 100644
--- a/drivers/char/mwave/mwavedd.c
+++ b/drivers/char/mwave/mwavedd.c
@@ -479,7 +479,8 @@ static const struct file_operations mwave_fops = {
 	.write		= mwave_write,
 	.unlocked_ioctl	= mwave_ioctl,
 	.open		= mwave_open,
-	.release	= mwave_close
+	.release	= mwave_close,
+	.llseek		= default_llseek,
 };
 
 
diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c
index 2604246501e4..8994ce32e6c7 100644
--- a/drivers/char/nwbutton.c
+++ b/drivers/char/nwbutton.c
@@ -182,6 +182,7 @@ static int button_read (struct file *filp, char __user *buffer,
 static const struct file_operations button_fops = {
 	.owner		= THIS_MODULE,
 	.read		= button_read,
+	.llseek		= noop_llseek,
 };
 
 /* 
diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c
index 8ecbcc174c15..b304ec052501 100644
--- a/drivers/char/pc8736x_gpio.c
+++ b/drivers/char/pc8736x_gpio.c
@@ -234,6 +234,7 @@ static const struct file_operations pc8736x_gpio_fileops = {
 	.open	= pc8736x_gpio_open,
 	.write	= nsc_gpio_write,
 	.read	= nsc_gpio_read,
+	.llseek = no_llseek,
 };
 
 static void __init pc8736x_init_shadow(void)
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index ec73d9f6d9ed..c99f6997e5e7 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1880,6 +1880,7 @@ static const struct file_operations cm4000_fops = {
 	.unlocked_ioctl	= cmm_ioctl,
 	.open	= cmm_open,
 	.release= cmm_close,
+	.llseek = no_llseek,
 };
 
 static struct pcmcia_device_id cm4000_ids[] = {
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index 815cde1d0570..9ecc58baa8f3 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -650,6 +650,7 @@ static const struct file_operations reader_fops = {
 	.open		= cm4040_open,
 	.release	= cm4040_close,
 	.poll		= cm4040_poll,
+	.llseek		= no_llseek,
 };
 
 static struct pcmcia_device_id cm4040_ids[] = {
diff --git a/drivers/char/random.c b/drivers/char/random.c
index caef35a46890..5a1aa64f4e76 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1165,6 +1165,7 @@ const struct file_operations random_fops = {
 	.poll  = random_poll,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
+	.llseek = noop_llseek,
 };
 
 const struct file_operations urandom_fops = {
@@ -1172,6 +1173,7 @@ const struct file_operations urandom_fops = {
 	.write = random_write,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
+	.llseek = noop_llseek,
 };
 
 /***************************************************************
diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c
index d58c2eb07f07..1e87a93164bf 100644
--- a/drivers/char/rio/rio_linux.c
+++ b/drivers/char/rio/rio_linux.c
@@ -241,6 +241,7 @@ static struct real_driver rio_real_driver = {
 static const struct file_operations rio_fw_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = rio_fw_ioctl,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice rio_fw_device = {
diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index 99e5272e3c53..0bc135b9b16f 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -67,6 +67,7 @@ static const struct file_operations scx200_gpio_fileops = {
 	.read    = nsc_gpio_read,
 	.open    = scx200_gpio_open,
 	.release = scx200_gpio_release,
+	.llseek  = no_llseek,
 };
 
 static struct cdev scx200_gpio_cdev;  /* use 1 cdev for all pins */
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index 32b74de18f5f..444ce17ac722 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -357,6 +357,7 @@ static const struct file_operations scdrv_fops = {
 	.poll =		scdrv_poll,
 	.open =		scdrv_open,
 	.release =	scdrv_release,
+	.llseek =	noop_llseek,
 };
 
 static struct class *snsc_class;
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index f2167f8e5aab..8ef16490810c 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -608,6 +608,7 @@ static unsigned int	sc26198_baudtable[] = {
 static const struct file_operations	stl_fsiomem = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= stl_memioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct class *stallion_class;
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 5b24db4ff7f1..e53f16865397 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -397,6 +397,7 @@ static struct real_driver sx_real_driver = {
 static const struct file_operations sx_fw_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = sx_fw_ioctl,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice sx_fw_device = {
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ef31bb81e843..f3019f53e875 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -772,6 +772,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
 
 static const struct file_operations proc_sysrq_trigger_operations = {
 	.write		= write_sysrq_trigger,
+	.llseek		= noop_llseek,
 };
 
 static void sysrq_init_procfs(void)
diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c
index cad4eb65f13d..ad264185eb10 100644
--- a/drivers/char/tb0219.c
+++ b/drivers/char/tb0219.c
@@ -261,6 +261,7 @@ static const struct file_operations tb0219_fops = {
 	.write		= tanbac_tb0219_write,
 	.open		= tanbac_tb0219_open,
 	.release	= tanbac_tb0219_release,
+	.llseek		= no_llseek,
 };
 
 static void tb0219_restart(char *command)
diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c
index 80ea6bcfffdc..d087532b29d7 100644
--- a/drivers/char/tlclk.c
+++ b/drivers/char/tlclk.c
@@ -267,6 +267,7 @@ static const struct file_operations tlclk_fops = {
 	.read = tlclk_read,
 	.open = tlclk_open,
 	.release = tlclk_release,
+	.llseek = noop_llseek,
 
 };
 
diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c
index f8bc79f6de34..3d6e96172453 100644
--- a/drivers/char/toshiba.c
+++ b/drivers/char/toshiba.c
@@ -95,6 +95,7 @@ static long tosh_ioctl(struct file *, unsigned int,
 static const struct file_operations tosh_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= tosh_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice tosh_device = {
diff --git a/drivers/char/uv_mmtimer.c b/drivers/char/uv_mmtimer.c
index c7072ba14f48..493b47a0d511 100644
--- a/drivers/char/uv_mmtimer.c
+++ b/drivers/char/uv_mmtimer.c
@@ -52,6 +52,7 @@ static const struct file_operations uv_mmtimer_fops = {
 	.owner = THIS_MODULE,
 	.mmap =	uv_mmtimer_mmap,
 	.unlocked_ioctl = uv_mmtimer_ioctl,
+	.llseek = noop_llseek,
 };
 
 /**
diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
index b663d573aad9..be6d6fb47cc5 100644
--- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c
+++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
@@ -567,6 +567,7 @@ static const struct file_operations hwicap_fops = {
 	.read = hwicap_read,
 	.open = hwicap_open,
 	.release = hwicap_release,
+	.llseek = noop_llseek,
 };
 
 static int __devinit hwicap_setup(struct device *dev, int id,
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
index 557e2272e5b3..ae2b8714d190 100644
--- a/drivers/dma/coh901318.c
+++ b/drivers/dma/coh901318.c
@@ -157,6 +157,7 @@ static const struct file_operations coh901318_debugfs_status_operations = {
 	.owner		= THIS_MODULE,
 	.open		= coh901318_debugfs_open,
 	.read		= coh901318_debugfs_read,
+	.llseek		= default_llseek,
 };
 
 
diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c
index 8528b10763ed..bf184fb59a5e 100644
--- a/drivers/firewire/nosy.c
+++ b/drivers/firewire/nosy.c
@@ -405,6 +405,7 @@ static const struct file_operations nosy_ops = {
 	.poll =			nosy_poll,
 	.open =			nosy_open,
 	.release =		nosy_release,
+	.llseek =		noop_llseek,
 };
 
 #define PHY_PACKET_SIZE 12 /* 1 payload, 1 inverse, 1 ack = 3 quadlets */
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 84da748555bc..ff6690f4fc87 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -284,7 +284,8 @@ EXPORT_SYMBOL(drm_exit);
 /** File operations structure */
 static const struct file_operations drm_stub_fops = {
 	.owner = THIS_MODULE,
-	.open = drm_stub_open
+	.open = drm_stub_open,
+	.llseek = noop_llseek,
 };
 
 static int __init drm_core_init(void)
diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c
index 61b4caf220fa..00f1bdaa65cd 100644
--- a/drivers/gpu/drm/i810/i810_dma.c
+++ b/drivers/gpu/drm/i810/i810_dma.c
@@ -119,6 +119,7 @@ static const struct file_operations i810_buffer_fops = {
 	.unlocked_ioctl = drm_ioctl,
 	.mmap = i810_mmap_buffers,
 	.fasync = drm_fasync,
+	.llseek = noop_llseek,
 };
 
 static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
diff --git a/drivers/gpu/drm/i830/i830_dma.c b/drivers/gpu/drm/i830/i830_dma.c
index 671aa18415ac..5c6eb65f4e51 100644
--- a/drivers/gpu/drm/i830/i830_dma.c
+++ b/drivers/gpu/drm/i830/i830_dma.c
@@ -121,6 +121,7 @@ static const struct file_operations i830_buffer_fops = {
 	.unlocked_ioctl = drm_ioctl,
 	.mmap = i830_mmap_buffers,
 	.fasync = drm_fasync,
+	.llseek = noop_llseek,
 };
 
 static int i830_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 5e43d7076789..048149748fdc 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -782,6 +782,7 @@ static const struct file_operations i915_wedged_fops = {
 	.open = i915_wedged_open,
 	.read = i915_wedged_read,
 	.write = i915_wedged_write,
+	.llseek = default_llseek,
 };
 
 /* As the drm_debugfs_init() routines are called before dev->dev_private is
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index b87569e96b16..3b7e0acf8164 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -1211,6 +1211,7 @@ static const struct file_operations vga_arb_device_fops = {
 	.poll = vga_arb_fpoll,
 	.open = vga_arb_open,
 	.release = vga_arb_release,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice vga_arb_device = {
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 850d02a7a925..61a3e572224a 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -1051,6 +1051,7 @@ static const struct file_operations hid_debug_events_fops = {
 	.read           = hid_debug_events_read,
 	.poll		= hid_debug_events_poll,
 	.release        = hid_debug_events_release,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c
index f6e80c7ca61e..5a6879e235ac 100644
--- a/drivers/hid/hid-roccat.c
+++ b/drivers/hid/hid-roccat.c
@@ -384,6 +384,7 @@ static const struct file_operations roccat_ops = {
 	.poll = roccat_poll,
 	.open = roccat_open,
 	.release = roccat_release,
+	.llseek = noop_llseek,
 };
 
 static int __init roccat_init(void)
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index 47d70c523d93..bb98fa87aa86 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -329,6 +329,7 @@ static const struct file_operations hidraw_ops = {
 	.open =         hidraw_open,
 	.release =      hidraw_release,
 	.unlocked_ioctl = hidraw_ioctl,
+	.llseek =	noop_llseek,
 };
 
 void hidraw_report_event(struct hid_device *hid, u8 *data, int len)
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 0a29c51114aa..e8f45d7ef3b2 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -847,6 +847,7 @@ static const struct file_operations hiddev_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= hiddev_compat_ioctl,
 #endif
+	.llseek		= noop_llseek,
 };
 
 static char *hiddev_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c
index 653db1bda934..23b8555215d2 100644
--- a/drivers/hwmon/asus_atk0110.c
+++ b/drivers/hwmon/asus_atk0110.c
@@ -762,6 +762,7 @@ static const struct file_operations atk_debugfs_ggrp_fops = {
 	.read		= atk_debugfs_ggrp_read,
 	.open		= atk_debugfs_ggrp_open,
 	.release	= atk_debugfs_ggrp_release,
+	.llseek		= no_llseek,
 };
 
 static void atk_debugfs_init(struct atk_data *data)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 6d622cb5ac81..23d1d1c5587c 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1903,6 +1903,7 @@ static const struct file_operations idetape_fops = {
 	.unlocked_ioctl	= idetape_chrdev_ioctl,
 	.open		= idetape_chrdev_open,
 	.release	= idetape_chrdev_release,
+	.llseek		= noop_llseek,
 };
 
 static int idetape_open(struct block_device *bdev, fmode_t mode)
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index 15341fc1c68b..c976285d313e 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -536,6 +536,7 @@ static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count,
 static const struct file_operations idle_fops = {
 	.open	= stats_open_generic,
 	.read	= stats_read_ul,
+	.llseek = default_llseek,
 };
 
 struct debugfs_file_info {
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
index d4ce8b63e19e..daef61d5e5bb 100644
--- a/drivers/infiniband/hw/ipath/ipath_diag.c
+++ b/drivers/infiniband/hw/ipath/ipath_diag.c
@@ -65,7 +65,8 @@ static const struct file_operations diag_file_ops = {
 	.write = ipath_diag_write,
 	.read = ipath_diag_read,
 	.open = ipath_diag_open,
-	.release = ipath_diag_release
+	.release = ipath_diag_release,
+	.llseek = default_llseek,
 };
 
 static ssize_t ipath_diagpkt_write(struct file *fp,
@@ -75,6 +76,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp,
 static const struct file_operations diagpkt_file_ops = {
 	.owner = THIS_MODULE,
 	.write = ipath_diagpkt_write,
+	.llseek = noop_llseek,
 };
 
 static atomic_t diagpkt_count = ATOMIC_INIT(0);
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index 65eb8929db22..6078992da3f0 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -63,7 +63,8 @@ static const struct file_operations ipath_file_ops = {
 	.open = ipath_open,
 	.release = ipath_close,
 	.poll = ipath_poll,
-	.mmap = ipath_mmap
+	.mmap = ipath_mmap,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 2fca70836dae..d13e72685dcf 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -103,6 +103,7 @@ static ssize_t atomic_stats_read(struct file *file, char __user *buf,
 
 static const struct file_operations atomic_stats_ops = {
 	.read = atomic_stats_read,
+	.llseek = default_llseek,
 };
 
 static ssize_t atomic_counters_read(struct file *file, char __user *buf,
@@ -120,6 +121,7 @@ static ssize_t atomic_counters_read(struct file *file, char __user *buf,
 
 static const struct file_operations atomic_counters_ops = {
 	.read = atomic_counters_read,
+	.llseek = default_llseek,
 };
 
 static ssize_t flash_read(struct file *file, char __user *buf,
@@ -224,6 +226,7 @@ bail:
 static const struct file_operations flash_ops = {
 	.read = flash_read,
 	.write = flash_write,
+	.llseek = default_llseek,
 };
 
 static int create_device_files(struct super_block *sb,
diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c
index 05dcf0d9a7d3..204c4dd9dce0 100644
--- a/drivers/infiniband/hw/qib/qib_diag.c
+++ b/drivers/infiniband/hw/qib/qib_diag.c
@@ -136,7 +136,8 @@ static const struct file_operations diag_file_ops = {
 	.write = qib_diag_write,
 	.read = qib_diag_read,
 	.open = qib_diag_open,
-	.release = qib_diag_release
+	.release = qib_diag_release,
+	.llseek = default_llseek,
 };
 
 static atomic_t diagpkt_count = ATOMIC_INIT(0);
@@ -149,6 +150,7 @@ static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data,
 static const struct file_operations diagpkt_file_ops = {
 	.owner = THIS_MODULE,
 	.write = qib_diagpkt_write,
+	.llseek = noop_llseek,
 };
 
 int qib_diag_add(struct qib_devdata *dd)
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 6b11645edf35..aa2be214270f 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -63,7 +63,8 @@ static const struct file_operations qib_file_ops = {
 	.open = qib_open,
 	.release = qib_close,
 	.poll = qib_poll,
-	.mmap = qib_mmapf
+	.mmap = qib_mmapf,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 9f989c0ba9d3..a0e6613e8be6 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -367,6 +367,7 @@ bail:
 static const struct file_operations flash_ops = {
 	.read = flash_read,
 	.write = flash_write,
+	.llseek = default_llseek,
 };
 
 static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd)
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index c908c5f83645..51330363c0eb 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -761,7 +761,8 @@ static const struct file_operations evdev_fops = {
 	.compat_ioctl	= evdev_ioctl_compat,
 #endif
 	.fasync		= evdev_fasync,
-	.flush		= evdev_flush
+	.flush		= evdev_flush,
+	.llseek		= no_llseek,
 };
 
 static int evdev_install_chrdev(struct evdev *evdev)
diff --git a/drivers/input/input.c b/drivers/input/input.c
index ab6982056518..7919c2537225 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -2047,6 +2047,7 @@ out:
 static const struct file_operations input_fops = {
 	.owner = THIS_MODULE,
 	.open = input_open_file,
+	.llseek = noop_llseek,
 };
 
 static int __init input_init(void)
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index d85bd8a7967d..502b2f73b434 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -736,6 +736,7 @@ static const struct file_operations joydev_fops = {
 	.compat_ioctl	= joydev_compat_ioctl,
 #endif
 	.fasync		= joydev_fasync,
+	.llseek		= no_llseek,
 };
 
 static int joydev_install_chrdev(struct joydev *joydev)
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 0d4266a533a5..2771ea778d34 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -804,6 +804,7 @@ static const struct file_operations uinput_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= uinput_compat_ioctl,
 #endif
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice uinput_misc = {
diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c
index d528a2dba064..31ec7265aac6 100644
--- a/drivers/input/mousedev.c
+++ b/drivers/input/mousedev.c
@@ -792,6 +792,7 @@ static const struct file_operations mousedev_fops = {
 	.open =		mousedev_open,
 	.release =	mousedev_release,
 	.fasync =	mousedev_fasync,
+	.llseek = noop_llseek,
 };
 
 static int mousedev_install_chrdev(struct mousedev *mousedev)
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index 998664854440..cd82bb125915 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -243,6 +243,7 @@ static const struct file_operations serio_raw_fops = {
 	.write =	serio_raw_write,
 	.poll =		serio_raw_poll,
 	.fasync =	serio_raw_fasync,
+	.llseek = noop_llseek,
 };
 
 
diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c
index de43c8c70ad0..859c81e9483b 100644
--- a/drivers/isdn/mISDN/timerdev.c
+++ b/drivers/isdn/mISDN/timerdev.c
@@ -267,6 +267,7 @@ static const struct file_operations mISDN_fops = {
 	.unlocked_ioctl	= mISDN_ioctl,
 	.open		= mISDN_open,
 	.release	= mISDN_close,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice mISDNtimer = {
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 85b714df8eae..3c781cdddda9 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -514,6 +514,7 @@ static const struct file_operations lguest_fops = {
 	.release = close,
 	.write	 = write,
 	.read	 = read,
+	.llseek  = default_llseek,
 };
 
 /*
diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c
index a3d25da2f275..1a57e88a38f7 100644
--- a/drivers/macintosh/ans-lcd.c
+++ b/drivers/macintosh/ans-lcd.c
@@ -137,6 +137,7 @@ const struct file_operations anslcd_fops = {
 	.write		= anslcd_write,
 	.unlocked_ioctl	= anslcd_ioctl,
 	.open		= anslcd_open,
+	.llseek		= default_llseek,
 };
 
 static struct miscdevice anslcd_dev = {
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 2d17e76066bd..44d171cc2252 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -2398,6 +2398,7 @@ static const struct file_operations pmu_device_fops = {
 #endif
 	.open		= pmu_open,
 	.release	= pmu_release,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice pmu_device = {
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3e39193e5036..4b54618b4159 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1596,6 +1596,7 @@ static const struct file_operations _ctl_fops = {
 	.unlocked_ioctl	 = dm_ctl_ioctl,
 	.compat_ioctl = dm_compat_ctl_ioctl,
 	.owner	 = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice _dm_misc = {
diff --git a/drivers/media/IR/imon.c b/drivers/media/IR/imon.c
index c185422ef28c..faed5a332c71 100644
--- a/drivers/media/IR/imon.c
+++ b/drivers/media/IR/imon.c
@@ -151,7 +151,8 @@ static const struct file_operations vfd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= &display_open,
 	.write		= &vfd_write,
-	.release	= &display_close
+	.release	= &display_close,
+	.llseek		= noop_llseek,
 };
 
 /* lcd character device file operations */
@@ -159,7 +160,8 @@ static const struct file_operations lcd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= &display_open,
 	.write		= &lcd_write,
-	.release	= &display_close
+	.release	= &display_close,
+	.llseek		= noop_llseek,
 };
 
 enum {
diff --git a/drivers/media/IR/lirc_dev.c b/drivers/media/IR/lirc_dev.c
index 5b145e8672fe..0acf6396e068 100644
--- a/drivers/media/IR/lirc_dev.c
+++ b/drivers/media/IR/lirc_dev.c
@@ -163,6 +163,7 @@ static struct file_operations fops = {
 	.unlocked_ioctl	= lirc_dev_fop_ioctl,
 	.open		= lirc_dev_fop_open,
 	.release	= lirc_dev_fop_close,
+	.llseek		= noop_llseek,
 };
 
 static int lirc_cdev_add(struct irctl *ir)
diff --git a/drivers/media/dvb/bt8xx/dst_ca.c b/drivers/media/dvb/bt8xx/dst_ca.c
index cf8705162845..7ed74dfc6b92 100644
--- a/drivers/media/dvb/bt8xx/dst_ca.c
+++ b/drivers/media/dvb/bt8xx/dst_ca.c
@@ -694,7 +694,8 @@ static const struct file_operations dst_ca_fops = {
 	.open = dst_ca_open,
 	.release = dst_ca_release,
 	.read = dst_ca_read,
-	.write = dst_ca_write
+	.write = dst_ca_write,
+	.llseek = noop_llseek,
 };
 
 static struct dvb_device dvbdev_ca = {
diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c
index 0042306ea11b..75c20ac82c0f 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.c
+++ b/drivers/media/dvb/dvb-core/dmxdev.c
@@ -1150,6 +1150,7 @@ static const struct file_operations dvb_demux_fops = {
 	.open = dvb_demux_open,
 	.release = dvb_demux_release,
 	.poll = dvb_demux_poll,
+	.llseek = default_llseek,
 };
 
 static struct dvb_device dvbdev_demux = {
@@ -1225,6 +1226,7 @@ static const struct file_operations dvb_dvr_fops = {
 	.open = dvb_dvr_open,
 	.release = dvb_dvr_release,
 	.poll = dvb_dvr_poll,
+	.llseek = default_llseek,
 };
 
 static struct dvb_device dvbdev_dvr = {
diff --git a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
index cb97e6b85432..ff8921dd737f 100644
--- a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
+++ b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
@@ -1628,6 +1628,7 @@ static const struct file_operations dvb_ca_fops = {
 	.open = dvb_ca_en50221_io_open,
 	.release = dvb_ca_en50221_io_release,
 	.poll = dvb_ca_en50221_io_poll,
+	.llseek = noop_llseek,
 };
 
 static struct dvb_device dvbdev_ca = {
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index 4d45b7d6b3fb..970c9b8882d4 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -2034,7 +2034,8 @@ static const struct file_operations dvb_frontend_fops = {
 	.unlocked_ioctl	= dvb_generic_ioctl,
 	.poll		= dvb_frontend_poll,
 	.open		= dvb_frontend_open,
-	.release	= dvb_frontend_release
+	.release	= dvb_frontend_release,
+	.llseek		= noop_llseek,
 };
 
 int dvb_register_frontend(struct dvb_adapter* dvb,
diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c
index 6c3a8a06ccab..82636f517b9e 100644
--- a/drivers/media/dvb/dvb-core/dvb_net.c
+++ b/drivers/media/dvb/dvb-core/dvb_net.c
@@ -1475,6 +1475,7 @@ static const struct file_operations dvb_net_fops = {
 	.unlocked_ioctl = dvb_net_ioctl,
 	.open =	dvb_generic_open,
 	.release = dvb_net_close,
+	.llseek = noop_llseek,
 };
 
 static struct dvb_device dvbdev_net = {
diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c
index b915c39d782f..774b40e4f589 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.c
+++ b/drivers/media/dvb/dvb-core/dvbdev.c
@@ -105,6 +105,7 @@ static const struct file_operations dvb_device_fops =
 {
 	.owner =	THIS_MODULE,
 	.open =		dvb_device_open,
+	.llseek =	noop_llseek,
 };
 
 static struct cdev dvb_device_cdev;
diff --git a/drivers/media/dvb/firewire/firedtv-ci.c b/drivers/media/dvb/firewire/firedtv-ci.c
index d3c2cf60de76..8ffb565f0704 100644
--- a/drivers/media/dvb/firewire/firedtv-ci.c
+++ b/drivers/media/dvb/firewire/firedtv-ci.c
@@ -220,6 +220,7 @@ static const struct file_operations fdtv_ca_fops = {
 	.open		= dvb_generic_open,
 	.release	= dvb_generic_release,
 	.poll		= fdtv_ca_io_poll,
+	.llseek		= noop_llseek,
 };
 
 static struct dvb_device fdtv_ca = {
diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c
index a6be529eec5c..893fbc57c72f 100644
--- a/drivers/media/dvb/ttpci/av7110.c
+++ b/drivers/media/dvb/ttpci/av7110.c
@@ -730,6 +730,7 @@ static const struct file_operations dvb_osd_fops = {
 	.unlocked_ioctl	= dvb_generic_ioctl,
 	.open		= dvb_generic_open,
 	.release	= dvb_generic_release,
+	.llseek		= noop_llseek,
 };
 
 static struct dvb_device dvbdev_osd = {
diff --git a/drivers/media/dvb/ttpci/av7110_av.c b/drivers/media/dvb/ttpci/av7110_av.c
index 13efba942dac..6ef3996565ad 100644
--- a/drivers/media/dvb/ttpci/av7110_av.c
+++ b/drivers/media/dvb/ttpci/av7110_av.c
@@ -1521,6 +1521,7 @@ static const struct file_operations dvb_video_fops = {
 	.open		= dvb_video_open,
 	.release	= dvb_video_release,
 	.poll		= dvb_video_poll,
+	.llseek		= noop_llseek,
 };
 
 static struct dvb_device dvbdev_video = {
@@ -1539,6 +1540,7 @@ static const struct file_operations dvb_audio_fops = {
 	.open		= dvb_audio_open,
 	.release	= dvb_audio_release,
 	.poll		= dvb_audio_poll,
+	.llseek		= noop_llseek,
 };
 
 static struct dvb_device dvbdev_audio = {
diff --git a/drivers/media/dvb/ttpci/av7110_ca.c b/drivers/media/dvb/ttpci/av7110_ca.c
index 4eba35a018e3..43f61f2eca98 100644
--- a/drivers/media/dvb/ttpci/av7110_ca.c
+++ b/drivers/media/dvb/ttpci/av7110_ca.c
@@ -353,6 +353,7 @@ static const struct file_operations dvb_ca_fops = {
 	.open		= dvb_ca_open,
 	.release	= dvb_generic_release,
 	.poll		= dvb_ca_poll,
+	.llseek		= default_llseek,
 };
 
 static struct dvb_device dvbdev_ca = {
diff --git a/drivers/media/dvb/ttpci/av7110_ir.c b/drivers/media/dvb/ttpci/av7110_ir.c
index b070e88d8c6b..908f272fe26c 100644
--- a/drivers/media/dvb/ttpci/av7110_ir.c
+++ b/drivers/media/dvb/ttpci/av7110_ir.c
@@ -312,6 +312,7 @@ static ssize_t av7110_ir_proc_write(struct file *file, const char __user *buffer
 static const struct file_operations av7110_ir_proc_fops = {
 	.owner		= THIS_MODULE,
 	.write		= av7110_ir_proc_write,
+	.llseek		= noop_llseek,
 };
 
 /* interrupt handler */
diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index 66379b413906..b048ecc56db9 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -583,6 +583,7 @@ static ssize_t ab3100_get_set_reg(struct file *file,
 static const struct file_operations ab3100_get_set_reg_fops = {
 	.open = ab3100_get_set_reg_open_file,
 	.write = ab3100_get_set_reg,
+	.llseek = noop_llseek,
 };
 
 static struct dentry *ab3100_dir;
diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index 557a8c2a7336..69c1f2fca141 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -640,6 +640,7 @@ static const struct file_operations ilo_fops = {
 	.poll		= ilo_poll,
 	.open 		= ilo_open,
 	.release 	= ilo_close,
+	.llseek		= noop_llseek,
 };
 
 static irqreturn_t ilo_isr(int irq, void *data)
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index 75ee0d3f6f45..6b38b5964294 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -279,6 +279,7 @@ static const struct file_operations phantom_file_ops = {
 	.unlocked_ioctl = phantom_ioctl,
 	.compat_ioctl = phantom_compat_ioctl,
 	.poll = phantom_poll,
+	.llseek = no_llseek,
 };
 
 static irqreturn_t phantom_isr(int irq, void *data)
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index cb3b4d228475..28852dfa310d 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -587,6 +587,7 @@ static const struct file_operations gru_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= gru_file_unlocked_ioctl,
 	.mmap		= gru_file_mmap,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice gru_miscdev = {
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index 53cb380c0987..46bc6d7551a3 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -245,6 +245,7 @@ static const struct file_operations mmc_dbg_ext_csd_fops = {
 	.open		= mmc_ext_csd_open,
 	.read		= mmc_ext_csd_read,
 	.release	= mmc_ext_csd_release,
+	.llseek		= default_llseek,
 };
 
 void mmc_add_card_debugfs(struct mmc_card *card)
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 3d2d1a69e9a0..af9fb0ff8210 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -1100,4 +1100,5 @@ const struct file_operations ubi_ctrl_cdev_operations = {
 	.owner          = THIS_MODULE,
 	.unlocked_ioctl = ctrl_cdev_ioctl,
 	.compat_ioctl   = ctrl_cdev_compat_ioctl,
+	.llseek		= noop_llseek,
 };
diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c
index f5058ff2b210..8427533fe313 100644
--- a/drivers/net/caif/caif_spi.c
+++ b/drivers/net/caif/caif_spi.c
@@ -240,13 +240,15 @@ static ssize_t dbgfs_frame(struct file *file, char __user *user_buf,
 static const struct file_operations dbgfs_state_fops = {
 	.open = dbgfs_open,
 	.read = dbgfs_state,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations dbgfs_frame_fops = {
 	.open = dbgfs_open,
 	.read = dbgfs_frame,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static inline void dev_debugfs_add(struct cfspi *cfspi)
diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index c327527fbbc8..e2bf10d90add 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -2026,6 +2026,7 @@ static const struct file_operations mem_debugfs_fops = {
 	.owner   = THIS_MODULE,
 	.open    = mem_open,
 	.read    = mem_read,
+	.llseek  = default_llseek,
 };
 
 static void __devinit add_debugfs_mem(struct adapter *adap, const char *name,
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 6695a51e09e9..323e81e9e808 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -856,7 +856,8 @@ static const struct file_operations ppp_device_fops = {
 	.poll		= ppp_poll,
 	.unlocked_ioctl	= ppp_ioctl,
 	.open		= ppp_open,
-	.release	= ppp_release
+	.release	= ppp_release,
+	.llseek		= noop_llseek,
 };
 
 static __net_init int ppp_init_net(struct net *net)
diff --git a/drivers/net/wimax/i2400m/debugfs.c b/drivers/net/wimax/i2400m/debugfs.c
index b1aec3e1892f..9c70b5fa3f51 100644
--- a/drivers/net/wimax/i2400m/debugfs.c
+++ b/drivers/net/wimax/i2400m/debugfs.c
@@ -119,6 +119,7 @@ const struct file_operations i2400m_rx_stats_fops = {
 	.open =		i2400m_stats_open,
 	.read =		i2400m_rx_stats_read,
 	.write =	i2400m_rx_stats_write,
+	.llseek =	default_llseek,
 };
 
 
@@ -171,6 +172,7 @@ const struct file_operations i2400m_tx_stats_fops = {
 	.open =		i2400m_stats_open,
 	.read =		i2400m_tx_stats_read,
 	.write =	i2400m_tx_stats_write,
+	.llseek =	default_llseek,
 };
 
 
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 1d05445d4ba3..ce77575e88b3 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -4430,21 +4430,24 @@ static const struct file_operations proc_statsdelta_ops = {
 	.owner		= THIS_MODULE,
 	.read		= proc_read,
 	.open		= proc_statsdelta_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_stats_ops = {
 	.owner		= THIS_MODULE,
 	.read		= proc_read,
 	.open		= proc_stats_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_status_ops = {
 	.owner		= THIS_MODULE,
 	.read		= proc_read,
 	.open		= proc_status_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_SSID_ops = {
@@ -4452,7 +4455,8 @@ static const struct file_operations proc_SSID_ops = {
 	.read		= proc_read,
 	.write		= proc_write,
 	.open		= proc_SSID_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_BSSList_ops = {
@@ -4460,7 +4464,8 @@ static const struct file_operations proc_BSSList_ops = {
 	.read		= proc_read,
 	.write		= proc_write,
 	.open		= proc_BSSList_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_APList_ops = {
@@ -4468,7 +4473,8 @@ static const struct file_operations proc_APList_ops = {
 	.read		= proc_read,
 	.write		= proc_write,
 	.open		= proc_APList_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_config_ops = {
@@ -4476,7 +4482,8 @@ static const struct file_operations proc_config_ops = {
 	.read		= proc_read,
 	.write		= proc_write,
 	.open		= proc_config_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_wepkey_ops = {
@@ -4484,7 +4491,8 @@ static const struct file_operations proc_wepkey_ops = {
 	.read		= proc_read,
 	.write		= proc_write,
 	.open		= proc_wepkey_open,
-	.release	= proc_close
+	.release	= proc_close,
+	.llseek		= default_llseek,
 };
 
 static struct proc_dir_entry *airo_entry;
diff --git a/drivers/net/wireless/ath/ath5k/debug.c b/drivers/net/wireless/ath/ath5k/debug.c
index 4cccc29964f6..fb339c3852ee 100644
--- a/drivers/net/wireless/ath/ath5k/debug.c
+++ b/drivers/net/wireless/ath/ath5k/debug.c
@@ -271,6 +271,7 @@ static const struct file_operations fops_beacon = {
 	.write = write_file_beacon,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -290,6 +291,7 @@ static const struct file_operations fops_reset = {
 	.write = write_file_reset,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 
@@ -369,6 +371,7 @@ static const struct file_operations fops_debug = {
 	.write = write_file_debug,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -480,6 +483,7 @@ static const struct file_operations fops_antenna = {
 	.write = write_file_antenna,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -591,6 +595,7 @@ static const struct file_operations fops_frameerrors = {
 	.write = write_file_frameerrors,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -748,6 +753,7 @@ static const struct file_operations fops_ani = {
 	.write = write_file_ani,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -811,6 +817,7 @@ static const struct file_operations fops_queue = {
 	.write = write_file_queue,
 	.open = ath5k_debugfs_open,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index 54aae931424e..cf500bf25ad5 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -71,7 +71,8 @@ static const struct file_operations fops_debug = {
 	.read = read_file_debug,
 	.write = write_file_debug,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 #endif
@@ -116,7 +117,8 @@ static const struct file_operations fops_tx_chainmask = {
 	.read = read_file_tx_chainmask,
 	.write = write_file_tx_chainmask,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -158,7 +160,8 @@ static const struct file_operations fops_rx_chainmask = {
 	.read = read_file_rx_chainmask,
 	.write = write_file_rx_chainmask,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -259,7 +262,8 @@ static ssize_t read_file_dma(struct file *file, char __user *user_buf,
 static const struct file_operations fops_dma = {
 	.read = read_file_dma,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 
@@ -375,7 +379,8 @@ static ssize_t read_file_interrupt(struct file *file, char __user *user_buf,
 static const struct file_operations fops_interrupt = {
 	.read = read_file_interrupt,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 void ath_debug_stat_rc(struct ath_softc *sc, int final_rate)
@@ -464,7 +469,8 @@ static ssize_t read_file_rcstat(struct file *file, char __user *user_buf,
 static const struct file_operations fops_rcstat = {
 	.read = read_file_rcstat,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static const char * ath_wiphy_state_str(enum ath_wiphy_state state)
@@ -623,7 +629,8 @@ static const struct file_operations fops_wiphy = {
 	.read = read_file_wiphy,
 	.write = write_file_wiphy,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 #define PR(str, elem)							\
@@ -702,7 +709,8 @@ void ath_debug_stat_tx(struct ath_softc *sc, struct ath_txq *txq,
 static const struct file_operations fops_xmit = {
 	.read = read_file_xmit,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static ssize_t read_file_recv(struct file *file, char __user *user_buf,
@@ -814,7 +822,8 @@ void ath_debug_stat_rx(struct ath_softc *sc, struct ath_rx_status *rs)
 static const struct file_operations fops_recv = {
 	.read = read_file_recv,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static ssize_t read_file_regidx(struct file *file, char __user *user_buf,
@@ -852,7 +861,8 @@ static const struct file_operations fops_regidx = {
 	.read = read_file_regidx,
 	.write = write_file_regidx,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static ssize_t read_file_regval(struct file *file, char __user *user_buf,
@@ -894,7 +904,8 @@ static const struct file_operations fops_regval = {
 	.read = read_file_regval,
 	.write = write_file_regval,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 int ath9k_init_debug(struct ath_hw *ah)
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 7d09b4b17bbd..bc2ca7d898e9 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -536,7 +536,8 @@ static ssize_t read_file_tgt_stats(struct file *file, char __user *user_buf,
 static const struct file_operations fops_tgt_stats = {
 	.read = read_file_tgt_stats,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static ssize_t read_file_xmit(struct file *file, char __user *user_buf,
@@ -584,7 +585,8 @@ static ssize_t read_file_xmit(struct file *file, char __user *user_buf,
 static const struct file_operations fops_xmit = {
 	.read = read_file_xmit,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 static ssize_t read_file_recv(struct file *file, char __user *user_buf,
@@ -613,7 +615,8 @@ static ssize_t read_file_recv(struct file *file, char __user *user_buf,
 static const struct file_operations fops_recv = {
 	.read = read_file_recv,
 	.open = ath9k_debugfs_open,
-	.owner = THIS_MODULE
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 int ath9k_htc_init_debug(struct ath_hw *ah)
diff --git a/drivers/net/wireless/iwlwifi/iwl-3945-rs.c b/drivers/net/wireless/iwlwifi/iwl-3945-rs.c
index 8e84a08ff951..293e1dbc166c 100644
--- a/drivers/net/wireless/iwlwifi/iwl-3945-rs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-3945-rs.c
@@ -873,6 +873,7 @@ static ssize_t iwl3945_sta_dbgfs_stats_table_read(struct file *file,
 static const struct file_operations rs_sta_dbgfs_stats_table_ops = {
 	.read = iwl3945_sta_dbgfs_stats_table_read,
 	.open = iwl3945_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static void iwl3945_add_debugfs(void *priv, void *priv_sta,
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
index 23e5c42e7d7e..a4378ba31ef6 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
@@ -2873,6 +2873,7 @@ static const struct file_operations rs_sta_dbgfs_scale_table_ops = {
 	.write = rs_sta_dbgfs_scale_table_write,
 	.read = rs_sta_dbgfs_scale_table_read,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file,
 			char __user *user_buf, size_t count, loff_t *ppos)
@@ -2915,6 +2916,7 @@ static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file,
 static const struct file_operations rs_sta_dbgfs_stats_table_ops = {
 	.read = rs_sta_dbgfs_stats_table_read,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t rs_sta_dbgfs_rate_scale_data_read(struct file *file,
@@ -2946,6 +2948,7 @@ static ssize_t rs_sta_dbgfs_rate_scale_data_read(struct file *file,
 static const struct file_operations rs_sta_dbgfs_rate_scale_data_ops = {
 	.read = rs_sta_dbgfs_rate_scale_data_read,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static void rs_add_debugfs(void *priv, void *priv_sta,
diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c
index 53b0b7711f02..0a0cc9667cd6 100644
--- a/drivers/net/wireless/iwmc3200wifi/debugfs.c
+++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c
@@ -402,24 +402,28 @@ static const struct file_operations iwm_debugfs_txq_fops = {
 	.owner =	THIS_MODULE,
 	.open =		iwm_generic_open,
 	.read =		iwm_debugfs_txq_read,
+	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_tx_credit_fops = {
 	.owner =	THIS_MODULE,
 	.open =		iwm_generic_open,
 	.read =		iwm_debugfs_tx_credit_read,
+	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_rx_ticket_fops = {
 	.owner =	THIS_MODULE,
 	.open =		iwm_generic_open,
 	.read =		iwm_debugfs_rx_ticket_read,
+	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_fw_err_fops = {
 	.owner =	THIS_MODULE,
 	.open =		iwm_generic_open,
 	.read =		iwm_debugfs_fw_err_read,
+	.llseek =	default_llseek,
 };
 
 void iwm_debugfs_init(struct iwm_priv *iwm)
diff --git a/drivers/net/wireless/iwmc3200wifi/sdio.c b/drivers/net/wireless/iwmc3200wifi/sdio.c
index edcb52330cf5..56383e7be835 100644
--- a/drivers/net/wireless/iwmc3200wifi/sdio.c
+++ b/drivers/net/wireless/iwmc3200wifi/sdio.c
@@ -364,6 +364,7 @@ static const struct file_operations iwm_debugfs_sdio_fops = {
 	.owner =	THIS_MODULE,
 	.open =		iwm_debugfs_sdio_open,
 	.read =		iwm_debugfs_sdio_read,
+	.llseek =	default_llseek,
 };
 
 static void if_sdio_debugfs_init(struct iwm_priv *iwm, struct dentry *parent_dir)
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index 74e94cc10e01..fbf3b0332bb7 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -962,6 +962,7 @@ static const struct file_operations lbs_debug_fops = {
 	.open = open_file_generic,
 	.write = lbs_debugfs_write,
 	.read = lbs_debugfs_read,
+	.llseek = default_llseek,
 };
 
 /**
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 88560d0ae50a..dab30a8c7470 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -2802,6 +2802,7 @@ static ssize_t ray_cs_essid_proc_write(struct file *file,
 static const struct file_operations ray_cs_essid_proc_fops = {
 	.owner		= THIS_MODULE,
 	.write		= ray_cs_essid_proc_write,
+	.llseek		= noop_llseek,
 };
 
 static ssize_t int_proc_write(struct file *file, const char __user *buffer,
@@ -2835,6 +2836,7 @@ static ssize_t int_proc_write(struct file *file, const char __user *buffer,
 static const struct file_operations int_proc_fops = {
 	.owner		= THIS_MODULE,
 	.write		= int_proc_write,
+	.llseek		= noop_llseek,
 };
 #endif
 
diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c
index 7d6f19a2805e..cea81e4c5c82 100644
--- a/drivers/net/wireless/rt2x00/rt2x00debug.c
+++ b/drivers/net/wireless/rt2x00/rt2x00debug.c
@@ -315,6 +315,7 @@ static const struct file_operations rt2x00debug_fop_queue_dump = {
 	.poll		= rt2x00debug_poll_queue_dump,
 	.open		= rt2x00debug_open_queue_dump,
 	.release	= rt2x00debug_release_queue_dump,
+	.llseek		= default_llseek,
 };
 
 static ssize_t rt2x00debug_read_queue_stats(struct file *file,
@@ -371,6 +372,7 @@ static const struct file_operations rt2x00debug_fop_queue_stats = {
 	.read		= rt2x00debug_read_queue_stats,
 	.open		= rt2x00debug_file_open,
 	.release	= rt2x00debug_file_release,
+	.llseek		= default_llseek,
 };
 
 #ifdef CONFIG_RT2X00_LIB_CRYPTO
@@ -423,6 +425,7 @@ static const struct file_operations rt2x00debug_fop_crypto_stats = {
 	.read		= rt2x00debug_read_crypto_stats,
 	.open		= rt2x00debug_file_open,
 	.release	= rt2x00debug_file_release,
+	.llseek		= default_llseek,
 };
 #endif
 
@@ -543,6 +546,7 @@ static const struct file_operations rt2x00debug_fop_dev_flags = {
 	.read		= rt2x00debug_read_dev_flags,
 	.open		= rt2x00debug_file_open,
 	.release	= rt2x00debug_file_release,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *rt2x00debug_create_file_driver(const char *name,
diff --git a/drivers/net/wireless/wl12xx/wl1251_debugfs.c b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
index a4ae7c4d94b5..fa620a5e5303 100644
--- a/drivers/net/wireless/wl12xx/wl1251_debugfs.c
+++ b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
@@ -238,6 +238,7 @@ static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf,
 static const struct file_operations tx_queue_len_ops = {
 	.read = tx_queue_len_read,
 	.open = wl1251_open_file_generic,
+	.llseek = generic_file_llseek,
 };
 
 static ssize_t tx_queue_status_read(struct file *file, char __user *userbuf,
@@ -259,6 +260,7 @@ static ssize_t tx_queue_status_read(struct file *file, char __user *userbuf,
 static const struct file_operations tx_queue_status_ops = {
 	.read = tx_queue_status_read,
 	.open = wl1251_open_file_generic,
+	.llseek = generic_file_llseek,
 };
 
 static void wl1251_debugfs_delete_files(struct wl1251 *wl)
diff --git a/drivers/net/wireless/wl12xx/wl1271_debugfs.c b/drivers/net/wireless/wl12xx/wl1271_debugfs.c
index 6e25303a8e7f..66c2b90ddfd4 100644
--- a/drivers/net/wireless/wl12xx/wl1271_debugfs.c
+++ b/drivers/net/wireless/wl12xx/wl1271_debugfs.c
@@ -239,6 +239,7 @@ static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf,
 static const struct file_operations tx_queue_len_ops = {
 	.read = tx_queue_len_read,
 	.open = wl1271_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t gpio_power_read(struct file *file, char __user *user_buf,
@@ -293,7 +294,8 @@ out:
 static const struct file_operations gpio_power_ops = {
 	.read = gpio_power_read,
 	.write = gpio_power_write,
-	.open = wl1271_open_file_generic
+	.open = wl1271_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static void wl1271_debugfs_delete_files(struct wl1271 *wl)
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
index bbd7516e0869..9bb4d4bdc237 100644
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -59,6 +59,7 @@ static ssize_t timeout_write(struct file *file, char const __user *buf,
 static const struct file_operations timeout_fops = {
 	.read		= timeout_read,
 	.write		= timeout_write,
+	.llseek		= default_llseek,
 };
 
 #endif
@@ -93,7 +94,8 @@ static ssize_t depth_write(struct file *file, char const __user *buf, size_t cou
 
 static const struct file_operations depth_fops = {
 	.read		= depth_read,
-	.write		= depth_write
+	.write		= depth_write,
+	.llseek		= default_llseek,
 };
 
 
@@ -105,6 +107,7 @@ static ssize_t pointer_size_read(struct file *file, char __user *buf, size_t cou
 
 static const struct file_operations pointer_size_fops = {
 	.read		= pointer_size_read,
+	.llseek		= default_llseek,
 };
 
 
@@ -116,6 +119,7 @@ static ssize_t cpu_type_read(struct file *file, char __user *buf, size_t count,
 
 static const struct file_operations cpu_type_fops = {
 	.read		= cpu_type_read,
+	.llseek		= default_llseek,
 };
 
 
@@ -151,6 +155,7 @@ static ssize_t enable_write(struct file *file, char const __user *buf, size_t co
 static const struct file_operations enable_fops = {
 	.read		= enable_read,
 	.write		= enable_write,
+	.llseek		= default_llseek,
 };
 
 
@@ -163,6 +168,7 @@ static ssize_t dump_write(struct file *file, char const __user *buf, size_t coun
 
 static const struct file_operations dump_fops = {
 	.write		= dump_write,
+	.llseek		= noop_llseek,
 };
 
 void oprofile_create_files(struct super_block *sb, struct dentry *root)
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 2766a6d3c2e9..6b6a1f71957d 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -117,12 +117,14 @@ static const struct file_operations ulong_fops = {
 	.read		= ulong_read_file,
 	.write		= ulong_write_file,
 	.open		= default_open,
+	.llseek		= default_llseek,
 };
 
 
 static const struct file_operations ulong_ro_fops = {
 	.read		= ulong_read_file,
 	.open		= default_open,
+	.llseek		= default_llseek,
 };
 
 
@@ -183,6 +185,7 @@ static ssize_t atomic_read_file(struct file *file, char __user *buf, size_t coun
 static const struct file_operations atomic_ro_fops = {
 	.read		= atomic_read_file,
 	.open		= default_open,
+	.llseek		= default_llseek,
 };
 
 
diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
index 909924692b8a..b3cf6223f63a 100644
--- a/drivers/pci/pcie/aer/aer_inject.c
+++ b/drivers/pci/pcie/aer/aer_inject.c
@@ -472,6 +472,7 @@ static ssize_t aer_inject_write(struct file *filp, const char __user *ubuf,
 static const struct file_operations aer_inject_fops = {
 	.write = aer_inject_write,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice aer_inject_device = {
diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index e3154ff7a39f..f200677851b8 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -2360,6 +2360,7 @@ static const struct file_operations sonypi_misc_fops = {
 	.release	= sonypi_misc_release,
 	.fasync		= sonypi_misc_fasync,
 	.unlocked_ioctl	= sonypi_misc_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice sonypi_misc_device = {
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c
index d60557cae8ef..ef151ce04430 100644
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -748,6 +748,7 @@ static const struct file_operations wdt_fops = {
 	.write	= wdt_write,
 	.open	= wdt_open,
 	.release = wdt_release,
+	.llseek = no_llseek,
 };
 
 static struct miscdevice wdt_dev = {
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index 7158f9528ecc..c71d89dba302 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -670,6 +670,7 @@ static const struct file_operations dasd_eer_fops = {
 	.read		= &dasd_eer_read,
 	.poll		= &dasd_eer_poll,
 	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice *dasd_eer_dev = NULL;
diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c
index 857dfcb7b359..eb28fb01a38a 100644
--- a/drivers/s390/char/fs3270.c
+++ b/drivers/s390/char/fs3270.c
@@ -520,6 +520,7 @@ static const struct file_operations fs3270_fops = {
 	.compat_ioctl	 = fs3270_ioctl,	/* ioctl */
 	.open		 = fs3270_open,		/* open */
 	.release	 = fs3270_close,	/* release */
+	.llseek		= no_llseek,
 };
 
 /*
diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c
index e021ec663ef9..5b8b8592d311 100644
--- a/drivers/s390/char/monreader.c
+++ b/drivers/s390/char/monreader.c
@@ -447,6 +447,7 @@ static const struct file_operations mon_fops = {
 	.release = &mon_close,
 	.read    = &mon_read,
 	.poll    = &mon_poll,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice mon_dev = {
diff --git a/drivers/s390/char/monwriter.c b/drivers/s390/char/monwriter.c
index 572a1e7fd099..e0702d3ea33b 100644
--- a/drivers/s390/char/monwriter.c
+++ b/drivers/s390/char/monwriter.c
@@ -274,6 +274,7 @@ static const struct file_operations monwrite_fops = {
 	.open	 = &monwrite_open,
 	.release = &monwrite_close,
 	.write	 = &monwrite_write,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice mon_dev = {
diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c
index 539045acaad4..883e2db02bd3 100644
--- a/drivers/s390/char/tape_char.c
+++ b/drivers/s390/char/tape_char.c
@@ -53,6 +53,7 @@ static const struct file_operations tape_fops =
 #endif
 	.open = tapechar_open,
 	.release = tapechar_release,
+	.llseek = no_llseek,
 };
 
 static int tapechar_major = TAPECHAR_MAJOR;
diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c
index 04e532eec032..0e7cb1a84151 100644
--- a/drivers/s390/char/vmcp.c
+++ b/drivers/s390/char/vmcp.c
@@ -177,6 +177,7 @@ static const struct file_operations vmcp_fops = {
 	.write		= vmcp_write,
 	.unlocked_ioctl	= vmcp_ioctl,
 	.compat_ioctl	= vmcp_ioctl,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice vmcp_dev = {
diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c
index e40a1b892866..0d6dc4b92cc2 100644
--- a/drivers/s390/char/vmlogrdr.c
+++ b/drivers/s390/char/vmlogrdr.c
@@ -97,6 +97,7 @@ static const struct file_operations vmlogrdr_fops = {
 	.open    = vmlogrdr_open,
 	.release = vmlogrdr_release,
 	.read    = vmlogrdr_read,
+	.llseek  = no_llseek,
 };
 
 
diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c
index e13508c98b1a..12ef9121d4f0 100644
--- a/drivers/s390/char/vmwatchdog.c
+++ b/drivers/s390/char/vmwatchdog.c
@@ -297,6 +297,7 @@ static const struct file_operations vmwdt_fops = {
 	.unlocked_ioctl = &vmwdt_ioctl,
 	.write   = &vmwdt_write,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice vmwdt_dev = {
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index f5ea3384a4b9..3b94044027c2 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -459,6 +459,7 @@ static const struct file_operations zcore_memmap_fops = {
 	.read		= zcore_memmap_read,
 	.open		= zcore_memmap_open,
 	.release	= zcore_memmap_release,
+	.llseek		= no_llseek,
 };
 
 static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf,
@@ -486,6 +487,7 @@ static const struct file_operations zcore_reipl_fops = {
 	.write		= zcore_reipl_write,
 	.open		= zcore_reipl_open,
 	.release	= zcore_reipl_release,
+	.llseek		= no_llseek,
 };
 
 #ifdef CONFIG_32BIT
diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c
index a83877c664a6..f2b77e7bfc6f 100644
--- a/drivers/s390/cio/chsc_sch.c
+++ b/drivers/s390/cio/chsc_sch.c
@@ -806,6 +806,7 @@ static const struct file_operations chsc_fops = {
 	.open = nonseekable_open,
 	.unlocked_ioctl = chsc_ioctl,
 	.compat_ioctl = chsc_ioctl,
+	.llseek = no_llseek,
 };
 
 static struct miscdevice chsc_misc_device = {
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index ac94ac751459..ca8e1c240c3c 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -1067,6 +1067,7 @@ static ssize_t cio_settle_write(struct file *file, const char __user *buf,
 static const struct file_operations cio_settle_proc_fops = {
 	.open = nonseekable_open,
 	.write = cio_settle_write,
+	.llseek = no_llseek,
 };
 
 static int __init cio_settle_init(void)
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 41e0aaefafd5..f5221749d180 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -897,7 +897,8 @@ static const struct file_operations zcrypt_fops = {
 	.compat_ioctl	= zcrypt_compat_ioctl,
 #endif
 	.open		= zcrypt_open,
-	.release	= zcrypt_release
+	.release	= zcrypt_release,
+	.llseek		= no_llseek,
 };
 
 /*
diff --git a/drivers/s390/scsi/zfcp_cfdc.c b/drivers/s390/scsi/zfcp_cfdc.c
index fcbd2b756da4..1838cda68ba8 100644
--- a/drivers/s390/scsi/zfcp_cfdc.c
+++ b/drivers/s390/scsi/zfcp_cfdc.c
@@ -251,8 +251,9 @@ static const struct file_operations zfcp_cfdc_fops = {
 	.open = nonseekable_open,
 	.unlocked_ioctl = zfcp_cfdc_dev_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl = zfcp_cfdc_dev_ioctl
+	.compat_ioctl = zfcp_cfdc_dev_ioctl,
 #endif
+	.llseek = no_llseek,
 };
 
 struct miscdevice zfcp_cfdc_misc = {
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 1690e53fb84a..55f71ea9c418 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -162,6 +162,7 @@ static const struct file_operations d7s_fops = {
 	.compat_ioctl =		d7s_ioctl,
 	.open =			d7s_open,
 	.release =		d7s_release,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice d7s_miscdev = {
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 078e5f4520ef..8ce414e39489 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -720,6 +720,7 @@ static const struct file_operations envctrl_fops = {
 #endif
 	.open =			envctrl_open,
 	.release =		envctrl_release,
+	.llseek =		noop_llseek,
 };	
 
 static struct miscdevice envctrl_dev = {
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index e20b7bdd4c78..67aad69cfbc2 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -222,7 +222,8 @@ static const struct file_operations twa_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= twa_chrdev_ioctl,
 	.open		= twa_chrdev_open,
-	.release	= NULL
+	.release	= NULL,
+	.llseek		= noop_llseek,
 };
 
 /* This function will complete an aen request from the isr */
diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c
index f481e734aad4..7afac93d889f 100644
--- a/drivers/scsi/3w-sas.c
+++ b/drivers/scsi/3w-sas.c
@@ -890,7 +890,8 @@ static const struct file_operations twl_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= twl_chrdev_ioctl,
 	.open		= twl_chrdev_open,
-	.release	= NULL
+	.release	= NULL,
+	.llseek		= noop_llseek,
 };
 
 /* This function passes sense data from firmware to scsi layer */
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 30d735ad35b5..5a2337306037 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -1059,7 +1059,8 @@ static const struct file_operations tw_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= tw_chrdev_ioctl,
 	.open		= tw_chrdev_open,
-	.release	= NULL
+	.release	= NULL,
+	.llseek		= noop_llseek,
 };
 
 /* This function will free up device extension resources */
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index cad6f9abaeb9..13af86eec96e 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1039,6 +1039,7 @@ static const struct file_operations aac_cfg_fops = {
 	.compat_ioctl   = aac_compat_cfg_ioctl,
 #endif
 	.open		= aac_cfg_open,
+	.llseek		= noop_llseek,
 };
 
 static struct scsi_host_template aac_driver_template = {
diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c
index d6532187f616..e40c9f7a002a 100644
--- a/drivers/scsi/ch.c
+++ b/drivers/scsi/ch.c
@@ -981,6 +981,7 @@ static const struct file_operations changer_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ch_ioctl_compat,
 #endif
+	.llseek		= noop_llseek,
 };
 
 static int __init init_ch_module(void)
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index ffc1edf5e80d..7a61206ed1c0 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -126,6 +126,7 @@ static const struct file_operations adpt_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_adpt_ioctl,
 #endif
+	.llseek		= noop_llseek,
 };
 
 /* Structures and definitions for synchronous message posting.
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index b860d650a563..9886586df01f 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -372,6 +372,7 @@ static const struct file_operations gdth_fops = {
     .unlocked_ioctl   = gdth_unlocked_ioctl,
     .open    = gdth_open,
     .release = gdth_close,
+    .llseek = noop_llseek,
 };
 
 #include "gdth_proc.h"
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 0b6e3228610a..aac80f7bb999 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -101,6 +101,7 @@ static const struct file_operations megadev_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= megadev_unlocked_ioctl,
 	.open		= megadev_open,
+	.llseek		= noop_llseek,
 };
 
 /*
diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c
index 41f82f76d884..ab801232d777 100644
--- a/drivers/scsi/megaraid/megaraid_mm.c
+++ b/drivers/scsi/megaraid/megaraid_mm.c
@@ -75,6 +75,7 @@ static const struct file_operations lsi_fops = {
 	.compat_ioctl = mraid_mm_compat_ioctl,
 #endif
 	.owner	= THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice megaraid_mm_dev = {
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index 99e4478c3f3e..209cc87b9a32 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -3957,6 +3957,7 @@ static const struct file_operations megasas_mgmt_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = megasas_mgmt_compat_ioctl,
 #endif
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/drivers/scsi/mpt2sas/mpt2sas_ctl.c b/drivers/scsi/mpt2sas/mpt2sas_ctl.c
index b774973f0765..9a4e584ae3cc 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_ctl.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_ctl.c
@@ -2952,6 +2952,7 @@ static const struct file_operations ctl_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = _ctl_ioctl_compat,
 #endif
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice ctl_dev = {
diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c
index ffdd9fdb9995..b31a8e3841d7 100644
--- a/drivers/scsi/osd/osd_uld.c
+++ b/drivers/scsi/osd/osd_uld.c
@@ -182,6 +182,7 @@ static const struct file_operations osd_fops = {
 	.open           = osd_uld_open,
 	.release        = osd_uld_release,
 	.unlocked_ioctl = osd_uld_ioctl,
+	.llseek		= noop_llseek,
 };
 
 struct osd_dev *osduld_path_lookup(const char *name)
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index ecc45c8b4e6b..4b8765785aeb 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -4165,6 +4165,7 @@ static const struct file_operations pmcraid_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = pmcraid_chr_ioctl,
 #endif
+	.llseek = noop_llseek,
 };
 
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 1e4bff695254..9946fac54255 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -3948,6 +3948,7 @@ static struct pci_driver qla2xxx_pci_driver = {
 
 static struct file_operations apidev_fops = {
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 /**
diff --git a/drivers/scsi/scsi_tgt_if.c b/drivers/scsi/scsi_tgt_if.c
index a87e21c35ef2..bbb02f6e917c 100644
--- a/drivers/scsi/scsi_tgt_if.c
+++ b/drivers/scsi/scsi_tgt_if.c
@@ -333,6 +333,7 @@ static const struct file_operations tgt_fops = {
 	.poll		= tgt_poll,
 	.write		= tgt_write,
 	.mmap		= tgt_mmap,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice tgt_miscdev = {
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 78d616315d8e..00baea1e2dbf 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1351,6 +1351,7 @@ static const struct file_operations sg_fops = {
 	.mmap = sg_mmap,
 	.release = sg_release,
 	.fasync = sg_fasync,
+	.llseek = no_llseek,
 };
 
 static struct class *sg_sysfs_class;
diff --git a/drivers/serial/mfd.c b/drivers/serial/mfd.c
index bc9af503907f..6703f3e802a2 100644
--- a/drivers/serial/mfd.c
+++ b/drivers/serial/mfd.c
@@ -227,12 +227,14 @@ static const struct file_operations port_regs_ops = {
 	.owner		= THIS_MODULE,
 	.open		= hsu_show_regs_open,
 	.read		= port_show_regs,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations dma_regs_ops = {
 	.owner		= THIS_MODULE,
 	.open		= hsu_show_regs_open,
 	.read		= dma_show_regs,
+	.llseek		= default_llseek,
 };
 
 static int hsu_debugfs_init(struct hsu_port *hsu)
diff --git a/drivers/spi/dw_spi.c b/drivers/spi/dw_spi.c
index d256cb00604c..a32aa66d66bf 100644
--- a/drivers/spi/dw_spi.c
+++ b/drivers/spi/dw_spi.c
@@ -131,6 +131,7 @@ static const struct file_operations mrst_spi_regs_ops = {
 	.owner		= THIS_MODULE,
 	.open		= spi_show_regs_open,
 	.read		= spi_show_regs,
+	.llseek		= default_llseek,
 };
 
 static int mrst_spi_debugfs_init(struct dw_spi *dws)
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index ea1bec3c9a13..4e6245e67995 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -545,6 +545,7 @@ static const struct file_operations spidev_fops = {
 	.unlocked_ioctl = spidev_ioctl,
 	.open =		spidev_open,
 	.release =	spidev_release,
+	.llseek =	no_llseek,
 };
 
 /*-------------------------------------------------------------------------*/
diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c
index 14091313cebb..fecb89e8c663 100644
--- a/drivers/staging/comedi/comedi_fops.c
+++ b/drivers/staging/comedi/comedi_fops.c
@@ -1922,6 +1922,7 @@ const struct file_operations comedi_fops = {
 	.mmap = comedi_mmap,
 	.poll = comedi_poll,
 	.fasync = comedi_fasync,
+	.llseek = noop_llseek,
 };
 
 struct class *comedi_class;
diff --git a/drivers/staging/crystalhd/crystalhd_lnx.c b/drivers/staging/crystalhd/crystalhd_lnx.c
index fbb80f09a3d9..af258991fe7f 100644
--- a/drivers/staging/crystalhd/crystalhd_lnx.c
+++ b/drivers/staging/crystalhd/crystalhd_lnx.c
@@ -351,6 +351,7 @@ static const struct file_operations chd_dec_fops = {
 	.unlocked_ioctl = chd_dec_ioctl,
 	.open    = chd_dec_open,
 	.release = chd_dec_close,
+	.llseek = noop_llseek,
 };
 
 static int __devinit chd_dec_init_chdev(struct crystalhd_adp *adp)
diff --git a/drivers/staging/dream/camera/msm_camera.c b/drivers/staging/dream/camera/msm_camera.c
index 81bd71fd816e..de4ab61efd4b 100644
--- a/drivers/staging/dream/camera/msm_camera.c
+++ b/drivers/staging/dream/camera/msm_camera.c
@@ -1941,6 +1941,7 @@ static const struct file_operations msm_fops_config = {
 	.open = msm_open,
 	.unlocked_ioctl = msm_ioctl_config,
 	.release = msm_release_config,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations msm_fops_control = {
@@ -1948,6 +1949,7 @@ static const struct file_operations msm_fops_control = {
 	.open = msm_open_control,
 	.unlocked_ioctl = msm_ioctl_control,
 	.release = msm_release_control,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations msm_fops_frame = {
@@ -1956,6 +1958,7 @@ static const struct file_operations msm_fops_frame = {
 	.unlocked_ioctl = msm_ioctl_frame,
 	.release = msm_release_frame,
 	.poll = msm_poll_frame,
+	.llseek = no_llseek,
 };
 
 static int msm_setup_cdev(struct msm_device *msm,
diff --git a/drivers/staging/dream/pmem.c b/drivers/staging/dream/pmem.c
index 7d6bbadd7fc7..3640d1f2376d 100644
--- a/drivers/staging/dream/pmem.c
+++ b/drivers/staging/dream/pmem.c
@@ -180,6 +180,7 @@ const struct file_operations pmem_fops = {
 	.mmap = pmem_mmap,
 	.open = pmem_open,
 	.unlocked_ioctl = pmem_ioctl,
+	.llseek = noop_llseek,
 };
 
 static int get_id(struct file *file)
@@ -1204,6 +1205,7 @@ static ssize_t debug_read(struct file *file, char __user *buf, size_t count,
 static struct file_operations debug_fops = {
 	.read = debug_read,
 	.open = debug_open,
+	.llseek = default_llseek,
 };
 #endif
 
diff --git a/drivers/staging/dream/qdsp5/adsp_driver.c b/drivers/staging/dream/qdsp5/adsp_driver.c
index 8197765aae1e..28a6f8da9477 100644
--- a/drivers/staging/dream/qdsp5/adsp_driver.c
+++ b/drivers/staging/dream/qdsp5/adsp_driver.c
@@ -582,6 +582,7 @@ static struct file_operations adsp_fops = {
 	.open = adsp_open,
 	.unlocked_ioctl = adsp_ioctl,
 	.release = adsp_release,
+	.llseek = no_llseek,
 };
 
 static void adsp_create(struct adsp_device *adev, const char *name,
diff --git a/drivers/staging/dream/qdsp5/audio_aac.c b/drivers/staging/dream/qdsp5/audio_aac.c
index a373f3522384..45f4c78ab6e7 100644
--- a/drivers/staging/dream/qdsp5/audio_aac.c
+++ b/drivers/staging/dream/qdsp5/audio_aac.c
@@ -1030,6 +1030,7 @@ static struct file_operations audio_aac_fops = {
 	.read = audio_read,
 	.write = audio_write,
 	.unlocked_ioctl = audio_ioctl,
+	.llseek = noop_llseek,
 };
 
 struct miscdevice audio_aac_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_amrnb.c b/drivers/staging/dream/qdsp5/audio_amrnb.c
index 07b79d5836e5..402bbc13281a 100644
--- a/drivers/staging/dream/qdsp5/audio_amrnb.c
+++ b/drivers/staging/dream/qdsp5/audio_amrnb.c
@@ -841,6 +841,7 @@ static struct file_operations audio_amrnb_fops = {
 	.read = audamrnb_read,
 	.write = audamrnb_write,
 	.unlocked_ioctl = audamrnb_ioctl,
+	.llseek = noop_llseek,
 };
 
 struct miscdevice audio_amrnb_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_evrc.c b/drivers/staging/dream/qdsp5/audio_evrc.c
index ad989ee87690..24a892647370 100644
--- a/drivers/staging/dream/qdsp5/audio_evrc.c
+++ b/drivers/staging/dream/qdsp5/audio_evrc.c
@@ -813,6 +813,7 @@ static struct file_operations audio_evrc_fops = {
 	.read = audevrc_read,
 	.write = audevrc_write,
 	.unlocked_ioctl = audevrc_ioctl,
+	.llseek = noop_llseek,
 };
 
 struct miscdevice audio_evrc_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_in.c b/drivers/staging/dream/qdsp5/audio_in.c
index 6ae48e72d145..b51fa096074e 100644
--- a/drivers/staging/dream/qdsp5/audio_in.c
+++ b/drivers/staging/dream/qdsp5/audio_in.c
@@ -921,12 +921,14 @@ static struct file_operations audio_fops = {
 	.read		= audio_in_read,
 	.write		= audio_in_write,
 	.unlocked_ioctl	= audio_in_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct file_operations audpre_fops = {
 	.owner          = THIS_MODULE,
 	.open           = audpre_open,
 	.unlocked_ioctl = audpre_ioctl,
+	.llseek		= noop_llseek,
 };
 
 struct miscdevice audio_in_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_mp3.c b/drivers/staging/dream/qdsp5/audio_mp3.c
index 530e1f35eed3..409a19ce6039 100644
--- a/drivers/staging/dream/qdsp5/audio_mp3.c
+++ b/drivers/staging/dream/qdsp5/audio_mp3.c
@@ -948,6 +948,7 @@ static struct file_operations audio_mp3_fops = {
 	.read		= audio_read,
 	.write		= audio_write,
 	.unlocked_ioctl	= audio_ioctl,
+	.llseek		= noop_llseek,
 };
 
 struct miscdevice audio_mp3_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_out.c b/drivers/staging/dream/qdsp5/audio_out.c
index 76d7fa5667d5..d20e89541567 100644
--- a/drivers/staging/dream/qdsp5/audio_out.c
+++ b/drivers/staging/dream/qdsp5/audio_out.c
@@ -807,12 +807,14 @@ static struct file_operations audio_fops = {
 	.read		= audio_read,
 	.write		= audio_write,
 	.unlocked_ioctl	= audio_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct file_operations audpp_fops = {
 	.owner		= THIS_MODULE,
 	.open		= audpp_open,
 	.unlocked_ioctl	= audpp_ioctl,
+	.llseek		= noop_llseek,
 };
 
 struct miscdevice audio_misc = {
diff --git a/drivers/staging/dream/qdsp5/audio_qcelp.c b/drivers/staging/dream/qdsp5/audio_qcelp.c
index effa96f34fdc..911bab416b85 100644
--- a/drivers/staging/dream/qdsp5/audio_qcelp.c
+++ b/drivers/staging/dream/qdsp5/audio_qcelp.c
@@ -824,6 +824,7 @@ static struct file_operations audio_qcelp_fops = {
 	.read = audqcelp_read,
 	.write = audqcelp_write,
 	.unlocked_ioctl = audqcelp_ioctl,
+	.llseek = noop_llseek,
 };
 
 struct miscdevice audio_qcelp_misc = {
diff --git a/drivers/staging/dream/qdsp5/evlog.h b/drivers/staging/dream/qdsp5/evlog.h
index 922ce670a32a..e5ab86b9dd7c 100644
--- a/drivers/staging/dream/qdsp5/evlog.h
+++ b/drivers/staging/dream/qdsp5/evlog.h
@@ -123,6 +123,7 @@ static int ev_log_open(struct inode *inode, struct file *file)
 static const struct file_operations ev_log_ops = {
 	.read = ev_log_read,
 	.open = ev_log_open,
+	.llseek = default_llseek,
 };
 
 static int ev_log_init(struct ev_log *log)
diff --git a/drivers/staging/dream/qdsp5/snd.c b/drivers/staging/dream/qdsp5/snd.c
index 037d7ffb7e67..e0f2f7bca29e 100644
--- a/drivers/staging/dream/qdsp5/snd.c
+++ b/drivers/staging/dream/qdsp5/snd.c
@@ -247,6 +247,7 @@ static struct file_operations snd_fops = {
 	.open		= snd_open,
 	.release	= snd_release,
 	.unlocked_ioctl	= snd_ioctl,
+	.llseek		= noop_llseek,
 };
 
 struct miscdevice snd_misc = {
diff --git a/drivers/staging/frontier/alphatrack.c b/drivers/staging/frontier/alphatrack.c
index 4e52105e6070..689099b57fd2 100644
--- a/drivers/staging/frontier/alphatrack.c
+++ b/drivers/staging/frontier/alphatrack.c
@@ -641,6 +641,7 @@ static const struct file_operations usb_alphatrack_fops = {
 	.open = usb_alphatrack_open,
 	.release = usb_alphatrack_release,
 	.poll = usb_alphatrack_poll,
+	.llseek = no_llseek,
 };
 
 /*
diff --git a/drivers/staging/frontier/tranzport.c b/drivers/staging/frontier/tranzport.c
index eed74f0fe0b6..3d12c1737edc 100644
--- a/drivers/staging/frontier/tranzport.c
+++ b/drivers/staging/frontier/tranzport.c
@@ -767,6 +767,7 @@ static const struct file_operations usb_tranzport_fops = {
 	.open = usb_tranzport_open,
 	.release = usb_tranzport_release,
 	.poll = usb_tranzport_poll,
+	.llseek = no_llseek,
 };
 
 /*
diff --git a/drivers/staging/iio/industrialio-core.c b/drivers/staging/iio/industrialio-core.c
index dd4d87a8bcaf..92a212f064bd 100644
--- a/drivers/staging/iio/industrialio-core.c
+++ b/drivers/staging/iio/industrialio-core.c
@@ -349,6 +349,7 @@ static const struct file_operations iio_event_chrdev_fileops = {
 	.release = iio_event_chrdev_release,
 	.open = iio_event_chrdev_open,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static void iio_event_dev_release(struct device *dev)
diff --git a/drivers/staging/iio/industrialio-ring.c b/drivers/staging/iio/industrialio-ring.c
index 6ab578e4f5f3..1c5f67253b82 100644
--- a/drivers/staging/iio/industrialio-ring.c
+++ b/drivers/staging/iio/industrialio-ring.c
@@ -133,6 +133,7 @@ static const struct file_operations iio_ring_fileops = {
 	.release = iio_ring_release,
 	.open = iio_ring_open,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 /**
diff --git a/drivers/staging/lirc/lirc_imon.c b/drivers/staging/lirc/lirc_imon.c
index 66493253042e..ed5c5fe022c9 100644
--- a/drivers/staging/lirc/lirc_imon.c
+++ b/drivers/staging/lirc/lirc_imon.c
@@ -115,7 +115,8 @@ static const struct file_operations display_fops = {
 	.owner		= THIS_MODULE,
 	.open		= &display_open,
 	.write		= &vfd_write,
-	.release	= &display_close
+	.release	= &display_close,
+	.llseek		= noop_llseek,
 };
 
 /*
diff --git a/drivers/staging/lirc/lirc_it87.c b/drivers/staging/lirc/lirc_it87.c
index ec11c0e949a0..543c5c3bf907 100644
--- a/drivers/staging/lirc/lirc_it87.c
+++ b/drivers/staging/lirc/lirc_it87.c
@@ -342,6 +342,7 @@ static const struct file_operations lirc_fops = {
 	.unlocked_ioctl	= lirc_ioctl,
 	.open		= lirc_open,
 	.release	= lirc_close,
+	.llseek		= noop_llseek,
 };
 
 static int set_use_inc(void *data)
diff --git a/drivers/staging/lirc/lirc_sasem.c b/drivers/staging/lirc/lirc_sasem.c
index 73166c3f581f..8f72a84f34ec 100644
--- a/drivers/staging/lirc/lirc_sasem.c
+++ b/drivers/staging/lirc/lirc_sasem.c
@@ -125,6 +125,7 @@ static const struct file_operations vfd_fops = {
 	.write		= &vfd_write,
 	.unlocked_ioctl	= &vfd_ioctl,
 	.release	= &vfd_close,
+	.llseek		= noop_llseek,
 };
 
 /* USB Device ID for Sasem USB Control Board */
diff --git a/drivers/staging/memrar/memrar_handler.c b/drivers/staging/memrar/memrar_handler.c
index a98b3f1f11e0..cfcaa8e5b8e6 100644
--- a/drivers/staging/memrar/memrar_handler.c
+++ b/drivers/staging/memrar/memrar_handler.c
@@ -890,6 +890,7 @@ static const struct file_operations memrar_fops = {
 	.mmap           = memrar_mmap,
 	.open           = memrar_open,
 	.release        = memrar_release,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice memrar_miscdev = {
diff --git a/drivers/staging/panel/panel.c b/drivers/staging/panel/panel.c
index 3221814a856e..6885f9a46609 100644
--- a/drivers/staging/panel/panel.c
+++ b/drivers/staging/panel/panel.c
@@ -1631,6 +1631,7 @@ static const struct file_operations keypad_fops = {
 	.read    = keypad_read,		/* read */
 	.open    = keypad_open,		/* open */
 	.release = keypad_release,	/* close */
+	.llseek  = default_llseek,
 };
 
 static struct miscdevice keypad_dev = {
diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c
index 7ee89492a755..7b3a7d04a109 100644
--- a/drivers/staging/tidspbridge/rmgr/drv_interface.c
+++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c
@@ -144,6 +144,7 @@ static const struct file_operations bridge_fops = {
 	.release = bridge_release,
 	.unlocked_ioctl = bridge_ioctl,
 	.mmap = bridge_mmap,
+	.llseek = noop_llseek,
 };
 
 #ifdef CONFIG_PM
diff --git a/drivers/telephony/ixj.c b/drivers/telephony/ixj.c
index b53deee25d74..5c6239e5aa21 100644
--- a/drivers/telephony/ixj.c
+++ b/drivers/telephony/ixj.c
@@ -6676,7 +6676,8 @@ static const struct file_operations ixj_fops =
         .poll           = ixj_poll,
         .unlocked_ioctl = ixj_ioctl,
         .release        = ixj_release,
-        .fasync         = ixj_fasync
+        .fasync         = ixj_fasync,
+        .llseek	 = default_llseek,
 };
 
 static int ixj_linetest(IXJ *j)
diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c
index f3873f650bb4..1915af201175 100644
--- a/drivers/telephony/phonedev.c
+++ b/drivers/telephony/phonedev.c
@@ -130,6 +130,7 @@ static const struct file_operations phone_fops =
 {
 	.owner		= THIS_MODULE,
 	.open		= phone_open,
+	.llseek		= noop_llseek,
 };
 
 /*
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index bff1afbde5a4..4d3a6fd1a152 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -740,6 +740,7 @@ static const struct file_operations uio_fops = {
 	.mmap		= uio_mmap,
 	.poll		= uio_poll,
 	.fasync		= uio_fasync,
+	.llseek		= noop_llseek,
 };
 
 static int uio_major_init(void)
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 094c76b5de17..6ee4451bfe2d 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -584,7 +584,8 @@ static const struct file_operations wdm_fops = {
 	.open =		wdm_open,
 	.flush =	wdm_flush,
 	.release =	wdm_release,
-	.poll =		wdm_poll
+	.poll =		wdm_poll,
+	.llseek =	noop_llseek,
 };
 
 static struct usb_class_driver wdm_class = {
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index e325162859b0..9eca4053312e 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -1043,6 +1043,7 @@ static const struct file_operations usblp_fops = {
 	.compat_ioctl =		usblp_ioctl,
 	.open =		usblp_open,
 	.release =	usblp_release,
+	.llseek =	noop_llseek,
 };
 
 static char *usblp_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 3e7c1b800ebb..6a54634ab823 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -987,6 +987,7 @@ static const struct file_operations fops = {
 	.open		= usbtmc_open,
 	.release	= usbtmc_release,
 	.unlocked_ioctl	= usbtmc_ioctl,
+	.llseek		= default_llseek,
 };
 
 static struct usb_class_driver usbtmc_class = {
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index f06f5dbc8cdc..580bcd396839 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -59,6 +59,7 @@ static int usb_open(struct inode * inode, struct file * file)
 static const struct file_operations usb_fops = {
 	.owner =	THIS_MODULE,
 	.open =		usb_open,
+	.llseek =	noop_llseek,
 };
 
 static struct usb_class {
diff --git a/drivers/usb/gadget/f_hid.c b/drivers/usb/gadget/f_hid.c
index 53e120208e99..2b98bd26364b 100644
--- a/drivers/usb/gadget/f_hid.c
+++ b/drivers/usb/gadget/f_hid.c
@@ -451,6 +451,7 @@ const struct file_operations f_hidg_fops = {
 	.write		= f_hidg_write,
 	.read		= f_hidg_read,
 	.poll		= f_hidg_poll,
+	.llseek		= noop_llseek,
 };
 
 static int __init hidg_bind(struct usb_configuration *c, struct usb_function *f)
diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c
index cf241c371a71..327a92a137b4 100644
--- a/drivers/usb/gadget/printer.c
+++ b/drivers/usb/gadget/printer.c
@@ -884,7 +884,8 @@ static const struct file_operations printer_io_operations = {
 	.fsync =	printer_fsync,
 	.poll =		printer_poll,
 	.unlocked_ioctl = printer_ioctl,
-	.release =	printer_close
+	.release =	printer_close,
+	.llseek =	noop_llseek,
 };
 
 /*-------------------------------------------------------------------------*/
diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
index 76b7fd2d838a..86afdc73322f 100644
--- a/drivers/usb/host/ehci-dbg.c
+++ b/drivers/usb/host/ehci-dbg.c
@@ -369,18 +369,21 @@ static const struct file_operations debug_async_fops = {
 	.open		= debug_async_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 static const struct file_operations debug_periodic_fops = {
 	.owner		= THIS_MODULE,
 	.open		= debug_periodic_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 static const struct file_operations debug_registers_fops = {
 	.owner		= THIS_MODULE,
 	.open		= debug_registers_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 static const struct file_operations debug_lpm_fops = {
 	.owner		= THIS_MODULE,
@@ -388,6 +391,7 @@ static const struct file_operations debug_lpm_fops = {
 	.read		= debug_lpm_read,
 	.write		= debug_lpm_write,
 	.release	= debug_lpm_close,
+	.llseek		= noop_llseek,
 };
 
 static struct dentry *ehci_debug_root;
diff --git a/drivers/usb/host/ohci-dbg.c b/drivers/usb/host/ohci-dbg.c
index 36abd2baa3ea..d7d34492934a 100644
--- a/drivers/usb/host/ohci-dbg.c
+++ b/drivers/usb/host/ohci-dbg.c
@@ -413,18 +413,21 @@ static const struct file_operations debug_async_fops = {
 	.open		= debug_async_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 static const struct file_operations debug_periodic_fops = {
 	.owner		= THIS_MODULE,
 	.open		= debug_periodic_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 static const struct file_operations debug_registers_fops = {
 	.owner		= THIS_MODULE,
 	.open		= debug_registers_open,
 	.read		= debug_output,
 	.release	= debug_close,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *ohci_debug_root;
diff --git a/drivers/usb/image/mdc800.c b/drivers/usb/image/mdc800.c
index e192e8f7c560..575b56c79e97 100644
--- a/drivers/usb/image/mdc800.c
+++ b/drivers/usb/image/mdc800.c
@@ -963,6 +963,7 @@ static const struct file_operations mdc800_device_ops =
 	.write =	mdc800_device_write,
 	.open =		mdc800_device_open,
 	.release =	mdc800_device_release,
+	.llseek =	noop_llseek,
 };
 
 
diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c
index 801324af9470..44f8b9225054 100644
--- a/drivers/usb/misc/adutux.c
+++ b/drivers/usb/misc/adutux.c
@@ -679,6 +679,7 @@ static const struct file_operations adu_fops = {
 	.write = adu_write,
 	.open = adu_open,
 	.release = adu_release,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c
index a54c3cb804ce..c6184b4d1695 100644
--- a/drivers/usb/misc/idmouse.c
+++ b/drivers/usb/misc/idmouse.c
@@ -105,6 +105,7 @@ static const struct file_operations idmouse_fops = {
 	.read = idmouse_read,
 	.open = idmouse_open,
 	.release = idmouse_release,
+	.llseek = default_llseek,
 };
 
 /* class driver information */
diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index bc88c79875a1..9b50db257019 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@ -730,6 +730,7 @@ static const struct file_operations iowarrior_fops = {
 	.open = iowarrior_open,
 	.release = iowarrior_release,
 	.poll = iowarrior_poll,
+	.llseek = noop_llseek,
 };
 
 static char *iowarrior_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c
index dd41d8710043..edffef642337 100644
--- a/drivers/usb/misc/ldusb.c
+++ b/drivers/usb/misc/ldusb.c
@@ -613,6 +613,7 @@ static const struct file_operations ld_usb_fops = {
 	.open =		ld_usb_open,
 	.release =	ld_usb_release,
 	.poll =		ld_usb_poll,
+	.llseek =	no_llseek,
 };
 
 /*
diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c
index cc13ae61712a..4e23d3841b43 100644
--- a/drivers/usb/misc/rio500.c
+++ b/drivers/usb/misc/rio500.c
@@ -439,6 +439,7 @@ static const struct file_operations usb_rio_fops = {
 	.unlocked_ioctl = ioctl_rio,
 	.open =		open_rio,
 	.release =	close_rio,
+	.llseek =	noop_llseek,
 };
 
 static struct usb_class_driver usb_rio_class = {
diff --git a/drivers/usb/misc/usblcd.c b/drivers/usb/misc/usblcd.c
index d00dde19194c..51648154bb44 100644
--- a/drivers/usb/misc/usblcd.c
+++ b/drivers/usb/misc/usblcd.c
@@ -282,6 +282,7 @@ static const struct file_operations lcd_fops = {
         .open =         lcd_open,
 	.unlocked_ioctl = lcd_ioctl,
         .release =      lcd_release,
+        .llseek =	 noop_llseek,
 };
 
 /*
diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c
index 552679b8dbd1..e24ce3123071 100644
--- a/drivers/usb/usb-skeleton.c
+++ b/drivers/usb/usb-skeleton.c
@@ -507,6 +507,7 @@ static const struct file_operations skel_fops = {
 	.open =		skel_open,
 	.release =	skel_release,
 	.flush =	skel_flush,
+	.llseek =	noop_llseek,
 };
 
 /*
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 29e850a7a2f9..c8523ce2e4c6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -869,6 +869,7 @@ static const struct file_operations vhost_net_fops = {
 	.compat_ioctl   = vhost_net_compat_ioctl,
 #endif
 	.open           = vhost_net_open,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice vhost_net_misc = {
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index b06647517c0e..42e303ff862a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1439,6 +1439,7 @@ static const struct file_operations fb_fops = {
 #ifdef CONFIG_FB_DEFERRED_IO
 	.fsync =	fb_deferred_io_fsync,
 #endif
+	.llseek =	default_llseek,
 };
 
 struct class *fb_class;
diff --git a/drivers/video/mbx/mbxdebugfs.c b/drivers/video/mbx/mbxdebugfs.c
index ecad96524570..12dec7634c55 100644
--- a/drivers/video/mbx/mbxdebugfs.c
+++ b/drivers/video/mbx/mbxdebugfs.c
@@ -175,36 +175,42 @@ static const struct file_operations sysconf_fops = {
 	.read = sysconf_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations clock_fops = {
 	.read = clock_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations display_fops = {
 	.read = display_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations gsctl_fops = {
 	.read = gsctl_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations sdram_fops = {
 	.read = sdram_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations misc_fops = {
 	.read = misc_read_file,
 	.write = write_file_dummy,
 	.open = open_file_generic,
+	.llseek = default_llseek,
 };
 
 static void __devinit mbxfb_debugfs_init(struct fb_info *fbi)
diff --git a/drivers/watchdog/ar7_wdt.c b/drivers/watchdog/ar7_wdt.c
index c764c52412e4..b29221783598 100644
--- a/drivers/watchdog/ar7_wdt.c
+++ b/drivers/watchdog/ar7_wdt.c
@@ -267,6 +267,7 @@ static const struct file_operations ar7_wdt_fops = {
 	.unlocked_ioctl	= ar7_wdt_ioctl,
 	.open		= ar7_wdt_open,
 	.release	= ar7_wdt_release,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice ar7_wdt_miscdev = {
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 566343b3c131..51ae4e84f4f9 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -524,6 +524,7 @@ static const struct file_operations cpwd_fops = {
 	.write =		cpwd_write,
 	.read =			cpwd_read,
 	.release =		cpwd_release,
+	.llseek =		no_llseek,
 };
 
 static int __devinit cpwd_probe(struct platform_device *op,
diff --git a/drivers/watchdog/ep93xx_wdt.c b/drivers/watchdog/ep93xx_wdt.c
index 59359c9a5e01..726b7df61fd0 100644
--- a/drivers/watchdog/ep93xx_wdt.c
+++ b/drivers/watchdog/ep93xx_wdt.c
@@ -188,6 +188,7 @@ static const struct file_operations ep93xx_wdt_fops = {
 	.unlocked_ioctl	= ep93xx_wdt_ioctl,
 	.open		= ep93xx_wdt_open,
 	.release	= ep93xx_wdt_release,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice ep93xx_wdt_miscdev = {
diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c
index 76b58abf4451..81e3d6100894 100644
--- a/drivers/watchdog/omap_wdt.c
+++ b/drivers/watchdog/omap_wdt.c
@@ -258,6 +258,7 @@ static const struct file_operations omap_wdt_fops = {
 	.unlocked_ioctl = omap_wdt_ioctl,
 	.open = omap_wdt_open,
 	.release = omap_wdt_release,
+	.llseek = no_llseek,
 };
 
 static int __devinit omap_wdt_probe(struct platform_device *pdev)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 66e185cfe92f..fec6ba3c08a8 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -467,6 +467,7 @@ static const struct file_operations evtchn_fops = {
 	.fasync  = evtchn_fasync,
 	.open    = evtchn_open,
 	.release = evtchn_release,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice evtchn_miscdev = {
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 78bfab0700ba..bd96340063c1 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -35,6 +35,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf,
 
 static const struct file_operations capabilities_file_ops = {
 	.read = capabilities_read,
+	.llseek = default_llseek,
 };
 
 static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
index 3b39c3752e21..1c1236087f78 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenfs/xenbus.c
@@ -594,4 +594,5 @@ const struct file_operations xenbus_file_ops = {
 	.open = xenbus_file_open,
 	.release = xenbus_file_release,
 	.poll = xenbus_file_poll,
+	.llseek = no_llseek,
 };
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..6153417caf57 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
+	.llseek		= noop_llseek,
 };
 
 const struct inode_operations afs_mntpt_inode_operations = {
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..eff9a419469a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	 = autofs_dev_ioctl,
 	.compat_ioctl = autofs_dev_ioctl_compat,
 	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..139fc8083f53 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -576,6 +576,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 static const struct file_operations bm_entry_operations = {
 	.read		= bm_entry_read,
 	.write		= bm_entry_write,
+	.llseek		= default_llseek,
 };
 
 /* /register */
@@ -643,6 +644,7 @@ out:
 
 static const struct file_operations bm_register_operations = {
 	.write		= bm_register_write,
+	.llseek		= noop_llseek,
 };
 
 /* /status */
@@ -680,6 +682,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 static const struct file_operations bm_status_operations = {
 	.read		= bm_status_read,
 	.write		= bm_status_write,
+	.llseek		= default_llseek,
 };
 
 /* Superblock handling */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..144f8a5730f5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -815,6 +815,7 @@ static const struct file_operations btrfs_ctl_fops = {
 	.unlocked_ioctl	 = btrfs_control_ioctl,
 	.compat_ioctl = btrfs_control_ioctl,
 	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice btrfs_misc = {
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
 	.read		= cachefiles_daemon_read,
 	.write		= cachefiles_daemon_write,
 	.poll		= cachefiles_daemon_poll,
+	.llseek		= noop_llseek,
 };
 
 struct cachefiles_daemon_cmd {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f80a4f25123c..e4a3318b812a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -454,6 +454,7 @@ static void cdev_purge(struct cdev *cdev)
  */
 const struct file_operations def_chr_fops = {
 	.open = chrdev_open,
+	.llseek = noop_llseek,
 };
 
 static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..028a9a0f588b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -39,6 +39,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= coda_pioctl,
+	.llseek		= noop_llseek,
 };
 
 /* the coda pioctl inode ops */
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index de89645777c7..9fa280bfcffe 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -346,6 +346,7 @@ static const struct file_operations coda_psdev_fops = {
 	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
+	.llseek		= noop_llseek,
 };
 
 static int init_coda_psdev(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
 	.open =		default_open,
+	.llseek =	noop_llseek,
 };
 
 static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
 	.read =		read_file_bool,
 	.write =	write_file_bool,
 	.open =		default_open,
+	.llseek =	default_llseek,
 };
 
 /**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 static const struct file_operations fops_blob = {
 	.read =		read_file_blob,
 	.open =		default_open,
+	.llseek =	default_llseek,
 };
 
 /**
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
 static const struct file_operations waiters_fops = {
 	.owner   = THIS_MODULE,
 	.open    = waiters_open,
-	.read    = waiters_read
+	.read    = waiters_read,
+	.llseek  = default_llseek,
 };
 
 void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
 	.read    = dev_read,
 	.write   = dev_write,
 	.poll    = dev_poll,
-	.owner   = THIS_MODULE
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
 	.write   = device_write,
 	.poll    = device_poll,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
 	.read    = device_read,
 	.write   = device_write,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
 	.open    = monitor_device_open,
 	.release = monitor_device_close,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..86e9da1c787e 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -332,6 +332,7 @@ const struct file_operations ecryptfs_dir_fops = {
 	.fsync = ecryptfs_fsync,
 	.fasync = ecryptfs_fasync,
 	.splice_read = generic_file_splice_read,
+	.llseek = default_llseek,
 };
 
 const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
 	.read    = ecryptfs_miscdev_read,
 	.write   = ecryptfs_miscdev_write,
 	.release = ecryptfs_miscdev_release,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice ecryptfs_miscdev = {
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
 	.poll		= eventfd_poll,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
+	.llseek		= noop_llseek,
 };
 
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..256bb7bb102a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -674,7 +674,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
-	.poll		= ep_eventpoll_poll
+	.poll		= ep_eventpoll_poll,
+	.llseek		= noop_llseek,
 };
 
 /* Fast test to see if the file is an evenpoll file */
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
  */
 const struct file_operations def_fifo_fops = {
 	.open		= fifo_open,	/* will set read_ or write_pipefifo_fops */
+	.llseek		= noop_llseek,
 };
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..7367e177186f 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 static const struct file_operations fuse_ctl_abort_ops = {
 	.open = nonseekable_open,
 	.write = fuse_conn_abort_write,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_ctl_waiting_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_waiting_read,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_conn_max_background_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_max_background_read,
 	.write = fuse_conn_max_background_write,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_congestion_threshold_read,
 	.write = fuse_conn_congestion_threshold_write,
+	.llseek = no_llseek,
 };
 
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
 	.unlocked_ioctl		= cuse_file_ioctl,
 	.compat_ioctl		= cuse_file_compat_ioctl,
 	.poll			= fuse_file_poll,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..55d25a68b496 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -771,6 +771,7 @@ const struct file_operations gfs2_dir_fops = {
 	.fsync		= gfs2_fsync,
 	.lock		= gfs2_lock,
 	.flock		= gfs2_flock,
+	.llseek		= default_llseek,
 };
 
 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -797,5 +798,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
 	.open		= gfs2_open,
 	.release	= gfs2_close,
 	.fsync		= gfs2_fsync,
+	.llseek		= default_llseek,
 };
 
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..4e2a45ea6140 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.readdir	= hppfs_readdir,
 	.open		= hppfs_dir_open,
 	.fsync		= hppfs_fsync,
+	.llseek		= default_llseek,
 };
 
 static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..113eba3d3c38 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -674,6 +674,7 @@ const struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..1eb4e89e045b 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -827,4 +827,5 @@ const struct file_operations logfs_dir_fops = {
 	.unlocked_ioctl	= logfs_ioctl,
 	.readdir	= logfs_readdir,
 	.read		= generic_read_dir,
+	.llseek		= default_llseek,
 };
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..06fa87e52e82 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -137,6 +137,7 @@ static const struct file_operations transaction_ops = {
 	.write		= nfsctl_transaction_write,
 	.read		= nfsctl_transaction_read,
 	.release	= simple_transaction_release,
+	.llseek		= default_llseek,
 };
 
 static int exports_open(struct inode *inode, struct file *file)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
 
 const struct file_operations def_blk_fops = {
 	.open		= no_blkdev_open,
+	.llseek		= noop_llseek,
 };
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..bbcb98e7fcc6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -433,6 +433,7 @@ static const struct file_operations fanotify_fops = {
 	.release	= fanotify_release,
 	.unlocked_ioctl	= fanotify_ioctl,
 	.compat_ioctl	= fanotify_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..24edc1185d53 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
 	.release	= inotify_release,
 	.unlocked_ioctl	= inotify_ioctl,
 	.compat_ioctl	= inotify_ioctl,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..a7ebd9d42dc8 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -612,6 +612,7 @@ static const struct file_operations dlmfs_file_operations = {
 	.poll		= dlmfs_file_poll,
 	.read		= dlmfs_file_read,
 	.write		= dlmfs_file_write,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations dlmfs_dir_inode_operations = {
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..0dbc6dae1de8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -628,6 +628,7 @@ static const struct file_operations ocfs2_control_fops = {
 	.read    = ocfs2_control_read,
 	.write   = ocfs2_control_write,
 	.owner   = THIS_MODULE,
+	.llseek  = default_llseek,
 };
 
 static struct miscdevice ocfs2_control_device = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..bc307d7a5b76 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,6 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_score_adj_operations = {
 	.read		= oom_score_adj_read,
 	.write		= oom_score_adj_write,
+	.llseek		= default_llseek,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -2039,11 +2040,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 static const struct file_operations proc_fdinfo_file_operations = {
 	.open           = nonseekable_open,
 	.read		= proc_fdinfo_read,
+	.llseek		= no_llseek,
 };
 
 static const struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfd,
+	.llseek		= default_llseek,
 };
 
 /*
@@ -2112,6 +2115,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
 static const struct file_operations proc_fdinfo_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfdinfo,
+	.llseek		= default_llseek,
 };
 
 /*
@@ -2343,6 +2347,7 @@ static int proc_attr_dir_readdir(struct file * filp,
 static const struct file_operations proc_attr_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_attr_dir_readdir,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2751,6 +2756,7 @@ static int proc_tgid_base_readdir(struct file * filp,
 static const struct file_operations proc_tgid_base_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_tgid_base_readdir,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -3088,6 +3094,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
 static const struct file_operations proc_tid_base_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_tid_base_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3324,4 +3331,5 @@ static const struct inode_operations proc_task_inode_operations = {
 static const struct file_operations proc_task_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_task_readdir,
+	.llseek		= default_llseek,
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..2fc52552271d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -364,6 +364,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 static const struct file_operations proc_sys_file_operations = {
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..93d99b316325 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -179,6 +179,7 @@ static int proc_root_readdir(struct file * filp,
 static const struct file_operations proc_root_operations = {
 	.read		 = generic_read_dir,
 	.readdir	 = proc_root_readdir,
+	.llseek		= default_llseek,
 };
 
 /*
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 271afc48b9a5..aa56920df335 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -539,6 +539,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 
 const struct file_operations proc_clear_refs_operations = {
 	.write		= clear_refs_write,
+	.llseek		= noop_llseek,
 };
 
 struct pagemapread {
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..268580535c92 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= romfs_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations romfs_dir_inode_operations = {
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..74047304b01a 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -206,6 +206,7 @@ static const struct file_operations signalfd_fops = {
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
+	.llseek		= noop_llseek,
 };
 
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir
+	.readdir = squashfs_readdir,
+	.llseek = default_llseek,
 };
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
 	.poll		= timerfd_poll,
 	.read		= timerfd_read,
+	.llseek		= noop_llseek,
 };
 
 static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..c6c553fd0b3d 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2625,6 +2625,7 @@ static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 /**
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c60e519e2917..e1e7b9635f5d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1219,6 +1219,7 @@ static const struct file_operations mqueue_file_operations = {
 	.flush = mqueue_flush_file,
 	.poll = mqueue_poll_file,
 	.read = mqueue_read_file,
+	.llseek = default_llseek,
 };
 
 static const struct super_operations mqueue_super_ops = {
diff --git a/ipc/shm.c b/ipc/shm.c
index 52ed77eb9713..7bc46a9fe1f8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -298,6 +298,7 @@ static const struct file_operations shm_file_operations = {
 #ifndef CONFIG_MMU
 	.get_unmapped_area	= shm_get_unmapped_area,
 #endif
+	.llseek		= noop_llseek,
 };
 
 static const struct file_operations shm_file_operations_huge = {
@@ -305,6 +306,7 @@ static const struct file_operations shm_file_operations_huge = {
 	.fsync		= shm_fsync,
 	.release	= shm_release,
 	.get_unmapped_area	= shm_get_unmapped_area,
+	.llseek		= noop_llseek,
 };
 
 int is_file_shm_hugepages(struct file *file)
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
 static const struct file_operations ikconfig_file_ops = {
 	.owner = THIS_MODULE,
 	.read = ikconfig_read_current,
+	.llseek = default_llseek,
 };
 
 static int __init ikconfig_init(void)
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
 static const struct file_operations gcov_reset_fops = {
 	.write	= reset_write,
 	.read	= reset_read,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..8b5ff2655ae0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1992,6 +1992,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 static const struct file_operations fops_kp = {
 	.read =         read_enabled_file_bool,
 	.write =        write_enabled_file_bool,
+	.llseek =	default_llseek,
 };
 
 static int __kprobes debugfs_kprobe_init(void)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..a96b850ba08a 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = {
 	.write = pm_qos_power_write,
 	.open = pm_qos_power_open,
 	.release = pm_qos_power_release,
+	.llseek = noop_llseek,
 };
 
 /* unlocked internal variant */
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 static const struct file_operations proc_profile_operations = {
 	.read		= read_profile,
 	.write		= write_profile,
+	.llseek		= default_llseek,
 };
 
 #ifdef CONFIG_SMP
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..2d5f3a757316 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -326,6 +326,7 @@ static const struct file_operations blk_dropped_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_dropped_open,
 	.read =		blk_dropped_read,
+	.llseek =	default_llseek,
 };
 
 static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +366,7 @@ static const struct file_operations blk_msg_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_msg_open,
 	.write =	blk_msg_write,
+	.llseek =	noop_llseek,
 };
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa7ece649fe1..5e1ad4763090 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
 	.open		= tracing_open_generic,
 	.read		= ftrace_profile_read,
 	.write		= ftrace_profile_write,
+	.llseek		= default_llseek,
 };
 
 /* used to initialize the real stat files */
@@ -2632,6 +2633,7 @@ static const struct file_operations ftrace_graph_fops = {
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
 	.release	= ftrace_graph_release,
+	.llseek		= seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 492197e2f86c..3aea966d16de 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3965,6 +3965,7 @@ static const struct file_operations rb_simple_fops = {
 	.open		= tracing_open_generic,
 	.read		= rb_simple_read,
 	.write		= rb_simple_write,
+	.llseek		= default_llseek,
 };
 
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..0369c5e09984 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -951,6 +951,7 @@ static const struct file_operations ftrace_enable_fops = {
 	.open = tracing_open_generic,
 	.read = event_enable_read,
 	.write = event_enable_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_event_format_fops = {
@@ -963,29 +964,34 @@ static const struct file_operations ftrace_event_format_fops = {
 static const struct file_operations ftrace_event_id_fops = {
 	.open = tracing_open_generic,
 	.read = event_id_read,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_event_filter_fops = {
 	.open = tracing_open_generic,
 	.read = event_filter_read,
 	.write = event_filter_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_subsystem_filter_fops = {
 	.open = tracing_open_generic,
 	.read = subsystem_filter_read,
 	.write = subsystem_filter_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_system_enable_fops = {
 	.open = tracing_open_generic,
 	.read = system_enable_read,
 	.write = system_enable_write,
+	.llseek = default_llseek,
 };
 
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
+	.llseek = default_llseek,
 };
 
 static struct dentry *event_trace_events_dir(void)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
 	.open		= tracing_open_generic,
 	.read		= stack_max_size_read,
 	.write		= stack_max_size_write,
+	.llseek		= default_llseek,
 };
 
 static void *
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 01e64270e246..4bfb0471f106 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -590,6 +590,7 @@ out_unlock:
 static const struct file_operations filter_fops = {
 	.read  = filter_read,
 	.write = filter_write,
+	.llseek = default_llseek,
 };
 
 static int dma_debug_fs_init(void)
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 6262aeae398e..f85da0779e5e 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -38,6 +38,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
 static const struct file_operations proc_atm_dev_ops = {
 	.owner =	THIS_MODULE,
 	.read =		proc_dev_atm_read,
+	.llseek =	noop_llseek,
 };
 
 static void add_stats(struct seq_file *seq, const char *aal,
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442fd..33d0e6297c21 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = dccpprobe_open,
 	.read    = dccpprobe_read,
+	.llseek  = noop_llseek,
 };
 
 static __init int dccpprobe_init(void)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..6211e2114173 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = tcpprobe_open,
 	.read    = tcpprobe_read,
+	.llseek  = noop_llseek,
 };
 
 static __init int tcpprobe_init(void)
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 4a4d35c750c6..b8b0ae79a743 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -102,7 +102,8 @@ static ssize_t tsf_write(struct file *file,
 static const struct file_operations tsf_ops = {
 	.read = tsf_read,
 	.write = tsf_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t reset_write(struct file *file, const char __user *user_buf,
@@ -121,6 +122,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
 static const struct file_operations reset_ops = {
 	.write = reset_write,
 	.open = mac80211_open_file_generic,
+	.llseek = noop_llseek,
 };
 
 static ssize_t noack_read(struct file *file, char __user *user_buf,
@@ -156,7 +158,8 @@ static ssize_t noack_write(struct file *file,
 static const struct file_operations noack_ops = {
 	.read = noack_read,
 	.write = noack_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
@@ -202,7 +205,8 @@ static ssize_t uapsd_queues_write(struct file *file,
 static const struct file_operations uapsd_queues_ops = {
 	.read = uapsd_queues_read,
 	.write = uapsd_queues_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
@@ -248,7 +252,8 @@ static ssize_t uapsd_max_sp_len_write(struct file *file,
 static const struct file_operations uapsd_max_sp_len_ops = {
 	.read = uapsd_max_sp_len_read,
 	.write = uapsd_max_sp_len_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t channel_type_read(struct file *file, char __user *user_buf,
@@ -280,7 +285,8 @@ static ssize_t channel_type_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations channel_type_ops = {
 	.read = channel_type_read,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t queues_read(struct file *file, char __user *user_buf,
@@ -303,7 +309,8 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations queues_ops = {
 	.read = queues_read,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 /* statistics stuff */
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index be04d46110fe..334cbd3d2aae 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -145,6 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
 static const struct file_operations rcname_ops = {
 	.read = rcname_read,
 	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 #endif
 
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 241e76f3fdf2..a290ad231d77 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -122,6 +122,7 @@ static const struct file_operations minstrel_stat_fops = {
 	.open = minstrel_stats_open,
 	.read = minstrel_stats_read,
 	.release = minstrel_stats_release,
+	.llseek = default_llseek,
 };
 
 void
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index 47438b4a9af5..7905f79cc2e4 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -206,6 +206,7 @@ static const struct file_operations rc_pid_fop_events = {
 	.poll = rate_control_pid_events_poll,
 	.open = rate_control_pid_events_open,
 	.release = rate_control_pid_events_release,
+	.llseek = noop_llseek,
 };
 
 void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a44762..d2ff15a2412b 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
 	.write   = recent_mt_proc_write,
 	.release = seq_release_private,
 	.owner   = THIS_MODULE,
+	.llseek = seq_lseek,
 };
 
 static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/nonet.c b/net/nonet.c
index 92e76640c7cd..b1a73fda9c12 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -22,4 +22,5 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
+	.llseek = noop_llseek,
 };
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 51875a0c5d48..04f599089e6d 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1241,6 +1241,7 @@ static const struct file_operations rfkill_fops = {
 	.unlocked_ioctl	= rfkill_fop_ioctl,
 	.compat_ioctl	= rfkill_fop_ioctl,
 #endif
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice rfkill_miscdev = {
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index db3a42b8b349..289b1ba62cac 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -117,6 +117,7 @@ static const struct file_operations sctpprobe_fops = {
 	.owner	= THIS_MODULE,
 	.open	= sctpprobe_open,
 	.read	= sctpprobe_read,
+	.llseek = noop_llseek,
 };
 
 sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
diff --git a/net/socket.c b/net/socket.c
index 2270b941bcc7..9eac5c394134 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -502,6 +502,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
+	.llseek = noop_llseek,
 };
 
 /**
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b06410e584e..6bc692582de5 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1441,6 +1441,7 @@ static const struct file_operations cache_flush_operations_procfs = {
 	.read		= read_flush_procfs,
 	.write		= write_flush_procfs,
 	.release	= release_flush_procfs,
+	.llseek		= no_llseek,
 };
 
 static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1646,6 +1647,7 @@ const struct file_operations cache_flush_operations_pipefs = {
 	.read		= read_flush_pipefs,
 	.write		= write_flush_pipefs,
 	.release	= release_flush_pipefs,
+	.llseek		= no_llseek,
 };
 
 int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 3f9a57e96508..39765bcfb472 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -103,6 +103,7 @@ static ssize_t ht40allow_map_read(struct file *file,
 static const struct file_operations ht40allow_map_ops = {
 	.read = ht40allow_map_read,
 	.open = cfg80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 #define DEBUGFS_ADD(name)						\
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c
index 178061e87ffe..cfe40addda76 100644
--- a/samples/kfifo/bytestream-example.c
+++ b/samples/kfifo/bytestream-example.c
@@ -148,6 +148,7 @@ static const struct file_operations fifo_fops = {
 	.owner		= THIS_MODULE,
 	.read		= fifo_read,
 	.write		= fifo_write,
+	.llseek		= noop_llseek,
 };
 
 static int __init example_init(void)
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c
index 71b2aabca96a..6f8e79e76c9e 100644
--- a/samples/kfifo/inttype-example.c
+++ b/samples/kfifo/inttype-example.c
@@ -141,6 +141,7 @@ static const struct file_operations fifo_fops = {
 	.owner		= THIS_MODULE,
 	.read		= fifo_read,
 	.write		= fifo_write,
+	.llseek		= noop_llseek,
 };
 
 static int __init example_init(void)
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c
index e68bd16a5da4..2d7529eeb294 100644
--- a/samples/kfifo/record-example.c
+++ b/samples/kfifo/record-example.c
@@ -155,6 +155,7 @@ static const struct file_operations fifo_fops = {
 	.owner		= THIS_MODULE,
 	.read		= fifo_read,
 	.write		= fifo_write,
+	.llseek		= noop_llseek,
 };
 
 static int __init example_init(void)
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
index 26fab33ffa8c..f4d89e008c32 100644
--- a/samples/tracepoints/tracepoint-sample.c
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -30,6 +30,7 @@ static int my_open(struct inode *inode, struct file *file)
 
 static const struct file_operations mark_ops = {
 	.open = my_open,
+	.llseek = noop_llseek,
 };
 
 static int __init sample_init(void)
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 7320331b44ab..a27086d16f05 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -86,7 +86,8 @@ static ssize_t profile_load(struct file *f, const char __user *buf, size_t size,
 }
 
 static const struct file_operations aa_fs_profile_load = {
-	.write = profile_load
+	.write = profile_load,
+	.llseek = default_llseek,
 };
 
 /* .replace file hook fn to load and/or replace policy */
@@ -107,7 +108,8 @@ static ssize_t profile_replace(struct file *f, const char __user *buf,
 }
 
 static const struct file_operations aa_fs_profile_replace = {
-	.write = profile_replace
+	.write = profile_replace,
+	.llseek = default_llseek,
 };
 
 /* .remove file hook fn to remove loaded policy */
@@ -134,7 +136,8 @@ static ssize_t profile_remove(struct file *f, const char __user *buf,
 }
 
 static const struct file_operations aa_fs_profile_remove = {
-	.write = profile_remove
+	.write = profile_remove,
+	.llseek = default_llseek,
 };
 
 /** Base file system setup **/
diff --git a/security/inode.c b/security/inode.c
index 8c777f022ad1..88839866cbcd 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -53,6 +53,7 @@ static const struct file_operations default_file_ops = {
 	.read =		default_read_file,
 	.write =	default_write_file,
 	.open =		default_open,
+	.llseek =	noop_llseek,
 };
 
 static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev)
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index a2b72d77f926..7512502d0162 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -968,6 +968,7 @@ static ssize_t smk_write_doi(struct file *file, const char __user *buf,
 static const struct file_operations smk_doi_ops = {
 	.read		= smk_read_doi,
 	.write		= smk_write_doi,
+	.llseek		= default_llseek,
 };
 
 /**
@@ -1031,6 +1032,7 @@ static ssize_t smk_write_direct(struct file *file, const char __user *buf,
 static const struct file_operations smk_direct_ops = {
 	.read		= smk_read_direct,
 	.write		= smk_write_direct,
+	.llseek		= default_llseek,
 };
 
 /**
@@ -1112,6 +1114,7 @@ static ssize_t smk_write_ambient(struct file *file, const char __user *buf,
 static const struct file_operations smk_ambient_ops = {
 	.read		= smk_read_ambient,
 	.write		= smk_write_ambient,
+	.llseek		= default_llseek,
 };
 
 /**
@@ -1191,6 +1194,7 @@ static ssize_t smk_write_onlycap(struct file *file, const char __user *buf,
 static const struct file_operations smk_onlycap_ops = {
 	.read		= smk_read_onlycap,
 	.write		= smk_write_onlycap,
+	.llseek		= default_llseek,
 };
 
 /**
@@ -1255,6 +1259,7 @@ static ssize_t smk_write_logging(struct file *file, const char __user *buf,
 static const struct file_operations smk_logging_ops = {
 	.read		= smk_read_logging,
 	.write		= smk_write_logging,
+	.llseek		= default_llseek,
 };
 /**
  * smk_fill_super - fill the /smackfs superblock
diff --git a/sound/core/seq/oss/seq_oss.c b/sound/core/seq/oss/seq_oss.c
index f25e3cc7ddfa..a1f1a2f00ccb 100644
--- a/sound/core/seq/oss/seq_oss.c
+++ b/sound/core/seq/oss/seq_oss.c
@@ -220,6 +220,7 @@ static const struct file_operations seq_oss_f_ops =
 	.poll =		odev_poll,
 	.unlocked_ioctl =	odev_ioctl,
 	.compat_ioctl =	odev_ioctl_compat,
+	.llseek =	noop_llseek,
 };
 
 static int __init
diff --git a/sound/core/sound.c b/sound/core/sound.c
index ac42af42b787..62a093efb453 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -184,7 +184,8 @@ static int snd_open(struct inode *inode, struct file *file)
 static const struct file_operations snd_fops =
 {
 	.owner =	THIS_MODULE,
-	.open =		snd_open
+	.open =		snd_open,
+	.llseek =	noop_llseek,
 };
 
 #ifdef CONFIG_SND_DYNAMIC_MINORS
diff --git a/sound/oss/msnd_pinnacle.c b/sound/oss/msnd_pinnacle.c
index 2e48b17667d0..ca942f7cd231 100644
--- a/sound/oss/msnd_pinnacle.c
+++ b/sound/oss/msnd_pinnacle.c
@@ -1117,6 +1117,7 @@ static const struct file_operations dev_fileops = {
 	.unlocked_ioctl	= dev_ioctl,
 	.open		= dev_open,
 	.release	= dev_release,
+	.llseek		= noop_llseek,
 };
 
 static int reset_dsp(void)
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index acc91daa1c55..4057d35343bb 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -223,6 +223,7 @@ static const struct file_operations codec_reg_fops = {
 	.open = codec_reg_open_file,
 	.read = codec_reg_read_file,
 	.write = codec_reg_write_file,
+	.llseek = default_llseek,
 };
 
 static void soc_init_codec_debugfs(struct snd_soc_codec *codec)
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 03cb7c05ebec..72a53d0a41e9 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1089,6 +1089,7 @@ static ssize_t dapm_widget_power_read_file(struct file *file,
 static const struct file_operations dapm_widget_power_fops = {
 	.open = dapm_widget_power_open_file,
 	.read = dapm_widget_power_read_file,
+	.llseek = default_llseek,
 };
 
 void snd_soc_dapm_debugfs_init(struct snd_soc_codec *codec)
diff --git a/sound/sound_core.c b/sound/sound_core.c
index cb61317df509..c03bbaefdbc3 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -165,6 +165,7 @@ static const struct file_operations soundcore_fops =
 	/* We must have an owner or the module locking fails */
 	.owner	= THIS_MODULE,
 	.open	= soundcore_open,
+	.llseek = noop_llseek,
 };
 
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d4853a54771a..e039f641d66b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1305,6 +1305,7 @@ static struct file_operations kvm_vcpu_fops = {
 	.unlocked_ioctl = kvm_vcpu_ioctl,
 	.compat_ioctl   = kvm_vcpu_ioctl,
 	.mmap           = kvm_vcpu_mmap,
+	.llseek		= noop_llseek,
 };
 
 /*
@@ -1774,6 +1775,7 @@ static struct file_operations kvm_vm_fops = {
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.mmap           = kvm_vm_mmap,
+	.llseek		= noop_llseek,
 };
 
 static int kvm_dev_ioctl_create_vm(void)
@@ -1867,6 +1869,7 @@ out:
 static struct file_operations kvm_chardev_ops = {
 	.unlocked_ioctl = kvm_dev_ioctl,
 	.compat_ioctl   = kvm_dev_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice kvm_dev = {
-- 
cgit v1.2.3


From ab91261f5c43f196ec7ff1d113847b87b7606b26 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 7 Jul 2010 22:55:17 +0200
Subject: vfs: don't use BKL in default_llseek

There are currently 191 users of default_llseek.
Nine of these are in device drivers that use the
big kernel lock. None of these ever touch
file->f_pos outside of llseek or file_pos_write.

Consequently, we never rely on the BKL
in the default_llseek function and can
replace that with i_mutex, which is also
used in generic_file_llseek.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/read_write.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..fd09f6166ccf 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -124,7 +124,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t retval;
 
-	lock_kernel();
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case SEEK_END:
 			offset += i_size_read(file->f_path.dentry->d_inode);
@@ -145,7 +145,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		retval = offset;
 	}
 out:
-	unlock_kernel();
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(default_llseek);
-- 
cgit v1.2.3


From 776c163b1b93c8dfa5edba885bc2bfbc2d228a5f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 7 Jul 2010 23:10:11 +0200
Subject: vfs: make no_llseek the default

All file operations now have an explicit .llseek
operation pointer, so we can change the default
action for future code.

This makes changes the default from default_llseek
to no_llseek, which always returns -ESPIPE if
a user tries to seek on a file without a .llseek
operation.

The name of the default_llseek function remains
unchanged, if anyone thinks we should change it,
please speak up.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/read_write.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index fd09f6166ccf..e757ef26e4ce 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -156,7 +156,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 
 	fn = no_llseek;
 	if (file->f_mode & FMODE_LSEEK) {
-		fn = default_llseek;
 		if (file->f_op && file->f_op->llseek)
 			fn = file->f_op->llseek;
 	}
-- 
cgit v1.2.3


From 6221ddd0f5e2ddc1d5d928119a2cde033d16f96d Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Fri, 1 Oct 2010 21:23:33 +0530
Subject: cifs: handle FindFirst failure gracefully

FindFirst failure due to permission errors or any other errors are silently
ignored by cifs_readdir(). This could cause problem to applications that depend
on the error to do further processing.

Reproducer:
  - mount a cifs share
  - mkdir tdir;touch tdir/1 tdir/2 tdir/3
  - chmod -x tdir
  - ls tdir

Currently, we start calling filldir() for '.' and '..' before we know we
whether FindFirst could succeed or not. If FindFirst fails later, there is no
way to notify VFS by setting buf.error and so VFS won't be able to catch this.
Fix this by moving the call to initiate_cifs_search() before we start doing
filldir().

This fixes https://bugzilla.samba.org/show_bug.cgi?id=7535

Reported-by: Tom Dexter <digitalaudiorock@gmail.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1f0bd0f972d4..6f3d13ff9470 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -790,6 +790,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
+	/*
+	 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
+	 * '..'. Otherwise we won't be able to notify VFS in case of failure.
+	 */
+	if (file->private_data == NULL) {
+		rc = initiate_cifs_search(xid, file);
+		cFYI(1, "initiate cifs search rc %d", rc);
+		if (rc)
+			goto rddir2_exit;
+	}
+
 	switch ((int) file->f_pos) {
 	case 0:
 		if (filldir(direntry, ".", 1, file->f_pos,
@@ -813,14 +824,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			if it before then restart search
 			if after then keep searching till find it */
 
-		if (file->private_data == NULL) {
-			rc = initiate_cifs_search(xid, file);
-			cFYI(1, "initiate cifs search rc %d", rc);
-			if (rc) {
-				FreeXid(xid);
-				return rc;
-			}
-		}
 		if (file->private_data == NULL) {
 			rc = -EINVAL;
 			FreeXid(xid);
-- 
cgit v1.2.3


From 2f4f26fcf393ef4a44abe10e79c1966e64e86055 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 13 Oct 2010 18:50:39 -0400
Subject: cifs: eliminate cifs_posix_open_inode_helper

cifs: eliminate cifs_posix_open_inode_helper

This function is redundant. The only thing it does is set the canCache
flags, but those get set in cifs_new_fileinfo anyway.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 67 ----------------------------------------------------------
 1 file changed, 67 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fd78a355f634..c10f0851cd21 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -106,64 +106,6 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OPEN;
 }
 
-/* all arguments to this function must be checked for validity in caller */
-static inline int
-cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
-			     struct cifsInodeInfo *pCifsInode, __u32 oplock,
-			     u16 netfid)
-{
-
-	write_lock(&GlobalSMBSeslock);
-
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	if (pCifsInode == NULL) {
-		write_unlock(&GlobalSMBSeslock);
-		return -EINVAL;
-	}
-
-	if (pCifsInode->clientCanCacheRead) {
-		/* we have the inode open somewhere else
-		   no need to discard cache data */
-		goto psx_client_can_cache;
-	}
-
-	/* BB FIXME need to fix this check to move it earlier into posix_open
-	   BB  fIX following section BB FIXME */
-
-	/* if not oplocked, invalidate inode pages if mtime or file
-	   size changed */
-/*	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
-	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
-			   (file->f_path.dentry->d_inode->i_size ==
-			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, "inode unchanged on server");
-	} else {
-		if (file->f_path.dentry->d_inode->i_mapping) {
-			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
-			if (rc != 0)
-				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
-		}
-		cFYI(1, "invalidating remote inode since open detected it "
-			 "changed");
-		invalidate_remote_inode(file->f_path.dentry->d_inode);
-	} */
-
-psx_client_can_cache:
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ)
-		pCifsInode->clientCanCacheRead = true;
-
-	/* will have to change the unlock if we reenable the
-	   filemap_fdatawrite (which does not seem necessary */
-	write_unlock(&GlobalSMBSeslock);
-	return 0;
-}
-
-/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode,
 	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
 	char *full_path, int xid)
@@ -271,15 +213,6 @@ int cifs_open(struct inode *inode, struct file *file)
 				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
-			/* no need for special case handling of setting mode
-			   on read only files needed here */
-
-			rc = cifs_posix_open_inode_helper(inode, file,
-					pCifsInode, oplock, netfid);
-			if (rc != 0) {
-				CIFSSMBClose(xid, tcon, netfid);
-				goto out;
-			}
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
 							tlink, oflags, oplock);
-- 
cgit v1.2.3


From d4396eafe402b710a8535137b3bf2abe6c059a15 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 15 Oct 2010 11:57:21 -0700
Subject: ocfs2/cluster: Release debugfs file elapsed_time_in_ms

An earlier commit forgot to remove a debugfs file, elapsed_time_in_ms.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6a1280a013ea..52c7557f3e25 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1384,6 +1384,7 @@ static void o2hb_region_release(struct config_item *item)
 	kfree(reg->hr_db_livenodes);
 	debugfs_remove(reg->hr_debug_livenodes);
 	debugfs_remove(reg->hr_debug_regnum);
+	debugfs_remove(reg->hr_debug_elapsed_time);
 	debugfs_remove(reg->hr_debug_dir);
 
 	spin_lock(&o2hb_live_lock);
-- 
cgit v1.2.3


From 2decd65a2630633cee04d0b83fdcee46ad2989a1 Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Tue, 12 Oct 2010 11:18:18 +0800
Subject: ocfs2: Avoid to evaluate xattr block flags again.

It was evaludated to indexed before, check it is ok i think.

Signed-off-by: Jeff Liu <jeff.liu@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 06fa5e77c40e..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
 		goto out;
 	}
 
-	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+	if (!indexed)
 		ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
 	else
 		ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
-- 
cgit v1.2.3


From 69d44ffbd772bede8c2a6d182e6e14f94826520b Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sat, 25 Sep 2010 23:34:22 +0200
Subject: sysfs: Add sysfs_merge_group() and sysfs_unmerge_group()

This patch (as1420) adds sysfs_merge_group() and sysfs_unmerge_group()
functions, allowing drivers easily to add and remove sets of
attributes to a pre-existing attribute group directory.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 fs/sysfs/group.c      | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sysfs.h | 15 +++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..442f34ff1af8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
 	sysfs_put(sd);
 }
 
+/**
+ * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to create and the attribute group they belong to.
+ *
+ * This function returns an error if the group doesn't exist or any of the
+ * files already exist in that group, in which case none of the new files
+ * are created.
+ */
+int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	int error = 0;
+	struct attribute *const *attr;
+	int i;
+
+	if (grp)
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	else
+		dir_sd = sysfs_get(kobj->sd);
+	if (!dir_sd)
+		return -ENOENT;
+
+	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
+		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+	if (error) {
+		while (--i >= 0)
+			sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
+	}
+	sysfs_put(dir_sd);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_merge_group);
+
+/**
+ * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to remove and the attribute group they belong to.
+ */
+void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	struct attribute *const *attr;
+
+	if (grp)
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	else
+		dir_sd = sysfs_get(kobj->sd);
+	if (dir_sd) {
+		for (attr = grp->attrs; *attr; ++attr)
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+		sysfs_put(dir_sd);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
+
 
 EXPORT_SYMBOL_GPL(sysfs_create_group);
 EXPORT_SYMBOL_GPL(sysfs_update_group);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 96eb576d82fd..30b881555fa5 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -164,6 +164,10 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 			const struct attribute *attr, const char *group);
 void sysfs_remove_file_from_group(struct kobject *kobj,
 			const struct attribute *attr, const char *group);
+int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp);
+void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp);
 
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);
 void sysfs_notify_dirent(struct sysfs_dirent *sd);
@@ -302,6 +306,17 @@ static inline void sysfs_remove_file_from_group(struct kobject *kobj,
 {
 }
 
+static inline int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	return 0;
+}
+
+static inline void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+}
+
 static inline void sysfs_notify(struct kobject *kobj, const char *dir,
 				const char *attr)
 {
-- 
cgit v1.2.3


From 7d08ae3c9205b559f90c3d7a3abba3c6479673c7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 17 Oct 2010 15:50:19 +0300
Subject: UBIFS: add a commentary about log recovery

Add a commentary which elaborates that 'ubifs_recover_log_leb()' recovers only
the last log LEB, not any. Also remove some unneeded newlines.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 3 ++-
 fs/ubifs/replay.c   | 9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c902a5de90ae..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -772,7 +772,8 @@ out_free:
  * @sbuf: LEB-sized buffer to use
  *
  * This function does a scan of a LEB, but caters for errors that might have
- * been caused by the unclean unmount from which we are attempting to recover.
+ * been caused by unclean reboots from which we are attempting to recover
+ * (assume that only the last log LEB can be corrupted by an unclean reboot).
  *
  * This function returns %0 on success and a negative error code on failure.
  */
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 730598cf6342..7df04ba4878e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -839,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	if (IS_ERR(sleb)) {
 		if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
 			return PTR_ERR(sleb);
+		/*
+		 * Note, the below function will recover this log LEB only if
+		 * it is the last, because unclean reboots can possibly corrupt
+		 * only the tail of the log.
+		 */
 		sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
 		if (IS_ERR(sleb))
 			return PTR_ERR(sleb);
@@ -850,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	}
 
 	node = sleb->buf;
-
 	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
 	if (c->cs_sqnum == 0) {
 		/*
@@ -897,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	}
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
-
 		cond_resched();
 
 		if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1030,9 +1033,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 		return -ENOMEM;
 
 	dbg_mnt("start replaying the journal");
-
 	c->replaying = 1;
-
 	lnum = c->ltail_lnum = c->lhead_lnum;
 	offs = c->lhead_offs;
 
-- 
cgit v1.2.3


From 608712fe8609492a8670638ea86b97fafe49ebba Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:33:56 -0400
Subject: cifs: fix flags handling in cifs_posix_open

The way flags are passed and converted for cifs_posix_open is rather
non-sensical. Some callers call cifs_posix_convert_flags on the flags
before they pass them to cifs_posix_open, whereas some don't. Two flag
conversion steps is just confusing though.

Change the function instead to clearly expect input in f_flags format,
and fix the callers to pass that in. Then, have cifs_posix_open call
cifs_convert_posix_flags to do the conversion. Move cifs_posix_open to
file.c as well so we can keep cifs_convert_posix_flags as a static
function.

Fix it also to not ignore O_CREAT, O_EXCL and O_TRUNC, and instead have
cifs_reopen_file mask those bits off before calling cifs_posix_open.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |   2 +-
 fs/cifs/dir.c       |  99 +++------------------------------------------
 fs/cifs/file.c      | 113 ++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 95 insertions(+), 119 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7f416abd34cf..4f7edbaf0118 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -111,7 +111,7 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				unsigned int oflags, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
-				int mode, int oflags,
+				int mode, unsigned int f_flags,
 				__u32 *poplock, __u16 *pnetfid, int xid);
 void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index c205ec9293ea..8c1af7128384 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -181,93 +181,6 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	return pCifsFile;
 }
 
-int cifs_posix_open(char *full_path, struct inode **pinode,
-			struct super_block *sb, int mode, int oflags,
-			__u32 *poplock, __u16 *pnetfid, int xid)
-{
-	int rc;
-	FILE_UNIX_BASIC_INFO *presp_data;
-	__u32 posix_flags = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifs_fattr fattr;
-	struct tcon_link *tlink;
-	struct cifsTconInfo *tcon;
-
-	cFYI(1, "posix open %s", full_path);
-
-	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-	if (presp_data == NULL)
-		return -ENOMEM;
-
-/* So far cifs posix extensions can only map the following flags.
-   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
-   so far we do not seem to need them, and we can treat them as local only */
-	if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
-		(FMODE_READ | FMODE_WRITE))
-		posix_flags = SMB_O_RDWR;
-	else if (oflags & FMODE_READ)
-		posix_flags = SMB_O_RDONLY;
-	else if (oflags & FMODE_WRITE)
-		posix_flags = SMB_O_WRONLY;
-	if (oflags & O_CREAT)
-		posix_flags |= SMB_O_CREAT;
-	if (oflags & O_EXCL)
-		posix_flags |= SMB_O_EXCL;
-	if (oflags & O_TRUNC)
-		posix_flags |= SMB_O_TRUNC;
-	/* be safe and imply O_SYNC for O_DSYNC */
-	if (oflags & O_DSYNC)
-		posix_flags |= SMB_O_SYNC;
-	if (oflags & O_DIRECTORY)
-		posix_flags |= SMB_O_DIRECTORY;
-	if (oflags & O_NOFOLLOW)
-		posix_flags |= SMB_O_NOFOLLOW;
-	if (oflags & O_DIRECT)
-		posix_flags |= SMB_O_DIRECT;
-
-	mode &= ~current_umask();
-
-	tlink = cifs_sb_tlink(cifs_sb);
-	if (IS_ERR(tlink)) {
-		rc = PTR_ERR(tlink);
-		goto posix_open_ret;
-	}
-
-	tcon = tlink_tcon(tlink);
-	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
-			     poplock, full_path, cifs_sb->local_nls,
-			     cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-	cifs_put_tlink(tlink);
-
-	if (rc)
-		goto posix_open_ret;
-
-	if (presp_data->Type == cpu_to_le32(-1))
-		goto posix_open_ret; /* open ok, caller does qpathinfo */
-
-	if (!pinode)
-		goto posix_open_ret; /* caller does not need info */
-
-	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
-
-	/* get new inode and set it up */
-	if (*pinode == NULL) {
-		cifs_fill_uniqueid(sb, &fattr);
-		*pinode = cifs_iget(sb, &fattr);
-		if (!*pinode) {
-			rc = -ENOMEM;
-			goto posix_open_ret;
-		}
-	} else {
-		cifs_fattr_to_inode(*pinode, &fattr);
-	}
-
-posix_open_ret:
-	kfree(presp_data);
-	return rc;
-}
-
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
@@ -321,9 +234,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		oplock = REQ_OPLOCK;
 
 	if (nd && (nd->flags & LOOKUP_OPEN))
-		oflags = nd->intent.open.flags;
+		oflags = nd->intent.open.file->f_flags;
 	else
-		oflags = FMODE_READ | SMB_O_CREAT;
+		oflags = O_RDONLY | O_CREAT;
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -359,9 +272,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		/* if the file is going to stay open, then we
 		   need to set the desired access properly */
 		desiredAccess = 0;
-		if (oflags & FMODE_READ)
+		if (OPEN_FMODE(oflags) & FMODE_READ)
 			desiredAccess |= GENERIC_READ; /* is this too little? */
-		if (oflags & FMODE_WRITE)
+		if (OPEN_FMODE(oflags) & FMODE_WRITE)
 			desiredAccess |= GENERIC_WRITE;
 
 		if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -716,11 +629,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	if (pTcon->unix_ext) {
 		if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
-		     (nd->intent.open.flags & O_CREAT)) {
+		     (nd->intent.open.file->f_flags & O_CREAT)) {
 			rc = cifs_posix_open(full_path, &newInode,
 					parent_dir_inode->i_sb,
 					nd->intent.open.create_mode,
-					nd->intent.open.flags, &oplock,
+					nd->intent.open.file->f_flags, &oplock,
 					&fileHandle, xid);
 			/*
 			 * The check below works around a bug in POSIX
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c10f0851cd21..b1ca6a43ac1a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
 		FILE_READ_DATA);
 }
 
-static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+static u32 cifs_posix_convert_flags(unsigned int flags)
 {
-	fmode_t posix_flags = 0;
+	u32 posix_flags = 0;
 
 	if ((flags & O_ACCMODE) == O_RDONLY)
-		posix_flags = FMODE_READ;
+		posix_flags = SMB_O_RDONLY;
 	else if ((flags & O_ACCMODE) == O_WRONLY)
-		posix_flags = FMODE_WRITE;
-	else if ((flags & O_ACCMODE) == O_RDWR) {
-		/* GENERIC_ALL is too much permission to request
-		   can cause unnecessary access denied on create */
-		/* return GENERIC_ALL; */
-		posix_flags = FMODE_READ | FMODE_WRITE;
-	}
-	/* can not map O_CREAT or O_EXCL or O_TRUNC flags when
-	   reopening a file.  They had their effect on the original open */
-	if (flags & O_APPEND)
-		posix_flags |= (fmode_t)O_APPEND;
+		posix_flags = SMB_O_WRONLY;
+	else if ((flags & O_ACCMODE) == O_RDWR)
+		posix_flags = SMB_O_RDWR;
+
+	if (flags & O_CREAT)
+		posix_flags |= SMB_O_CREAT;
+	if (flags & O_EXCL)
+		posix_flags |= SMB_O_EXCL;
+	if (flags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	/* be safe and imply O_SYNC for O_DSYNC */
 	if (flags & O_DSYNC)
-		posix_flags |= (fmode_t)O_DSYNC;
-	if (flags & __O_SYNC)
-		posix_flags |= (fmode_t)__O_SYNC;
+		posix_flags |= SMB_O_SYNC;
 	if (flags & O_DIRECTORY)
-		posix_flags |= (fmode_t)O_DIRECTORY;
+		posix_flags |= SMB_O_DIRECTORY;
 	if (flags & O_NOFOLLOW)
-		posix_flags |= (fmode_t)O_NOFOLLOW;
+		posix_flags |= SMB_O_NOFOLLOW;
 	if (flags & O_DIRECT)
-		posix_flags |= (fmode_t)O_DIRECT;
+		posix_flags |= SMB_O_DIRECT;
 
 	return posix_flags;
 }
@@ -159,6 +157,68 @@ client_can_cache:
 	return rc;
 }
 
+int cifs_posix_open(char *full_path, struct inode **pinode,
+			struct super_block *sb, int mode, unsigned int f_flags,
+			__u32 *poplock, __u16 *pnetfid, int xid)
+{
+	int rc;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_fattr fattr;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
+
+	cFYI(1, "posix open %s", full_path);
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto posix_open_ret;
+	}
+
+	tcon = tlink_tcon(tlink);
+	mode &= ~current_umask();
+
+	posix_flags = cifs_posix_convert_flags(f_flags);
+	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
+			     poplock, full_path, cifs_sb->local_nls,
+			     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
+
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+
+	/* get new inode and set it up */
+	if (*pinode == NULL) {
+		cifs_fill_uniqueid(sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode) {
+			rc = -ENOMEM;
+			goto posix_open_ret;
+		}
+	} else {
+		cifs_fattr_to_inode(*pinode, &fattr);
+	}
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
 int cifs_open(struct inode *inode, struct file *file)
 {
 	int rc = -EACCES;
@@ -205,12 +265,10 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
 				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				file->f_flags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
 
@@ -426,8 +484,13 @@ reopen_error_exit:
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-		/* can not refresh inode info since size could be stale */
+
+		/*
+		 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
+		 * original open. Must mask them off for a reopen.
+		 */
+		unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+
 		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
 				cifs_sb->mnt_file_mode /* ignored */,
 				oflags, &oplock, &netfid, xid);
-- 
cgit v1.2.3


From f6a53460e2a105904deeada737b3f878d78517b3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:33:57 -0400
Subject: cifs: eliminate oflags option from cifs_new_fileinfo

Eliminate the poor, misunderstood "oflags" option from cifs_new_fileinfo.
The callers mostly pass in the filp->f_flags here.

That's not correct however since we're checking that value for
the presence of FMODE_READ. Luckily that only affects how the f_list is
ordered. What it really wants here is the file->f_mode. Just use that
field from the filp to determine it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 3 +--
 fs/cifs/dir.c       | 9 ++++-----
 fs/cifs/file.c      | 5 ++---
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 4f7edbaf0118..781676e9cfc3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,8 +107,7 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 
 extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
-				struct tcon_link *tlink,
-				unsigned int oflags, __u32 oplock);
+				struct tcon_link *tlink, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
 				int mode, unsigned int f_flags,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8c1af7128384..ce1fa3027b23 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -132,7 +132,7 @@ cifs_bp_rename_retry:
 
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
-		  struct tcon_link *tlink, unsigned int oflags, __u32 oplock)
+		  struct tcon_link *tlink, __u32 oplock)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct cifsFileInfo *pCifsFile;
@@ -161,7 +161,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 	pCifsInode = CIFS_I(newinode);
 	if (pCifsInode) {
 		/* if readable file instance put first in list*/
-		if (oflags & FMODE_READ)
+		if (file->f_mode & FMODE_READ)
 			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
 		else
 			list_add_tail(&pCifsFile->flist,
@@ -396,7 +396,7 @@ cifs_create_set_dentry:
 		}
 
 		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-						tlink, oflags, oplock);
+						tlink, oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -670,8 +670,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			}
 
 			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  tlink, nd->intent.open.flags,
-						  oplock);
+						  tlink, oplock);
 			if (cfile == NULL) {
 				fput(filp);
 				CIFSSMBClose(xid, pTcon, fileHandle);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1ca6a43ac1a..774e3ac1208b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -273,7 +273,7 @@ int cifs_open(struct inode *inode, struct file *file)
 			cFYI(1, "posix open succeeded");
 
 			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
-							tlink, oflags, oplock);
+							tlink, oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
@@ -365,8 +365,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (rc != 0)
 		goto out;
 
-	pCifsFile = cifs_new_fileinfo(inode, netfid, file, tlink,
-					file->f_flags, oplock);
+	pCifsFile = cifs_new_fileinfo(inode, netfid, file, tlink, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
-- 
cgit v1.2.3


From abfe1eedd682ea0f20e7035445982e6d371a2024 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:33:58 -0400
Subject: cifs: eliminate the inode argument from cifs_new_fileinfo

It already takes a file pointer. The inode associated with that had damn
well better be the same one we're passing in anyway. Thus, there's no
need for a separate argument here.

Also, get rid of the bogus check for a null pCifsInode pointer. The
CIFS_I macro uses container_of(), and that will virtually never return a
NULL pointer anyway.

Finally, move the setting of the canCache* flags outside of the lock.
Other places in the code don't hold that lock when setting it, so I
assume it's not really needed here either.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  3 +--
 fs/cifs/dir.c       | 40 ++++++++++++++++++----------------------
 fs/cifs/file.c      |  6 +++---
 3 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 781676e9cfc3..ca6a0d9130cb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -105,8 +105,7 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 
-extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
-				__u16 fileHandle, struct file *file,
+extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 				struct tcon_link *tlink, __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ce1fa3027b23..36cf3e2ec252 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -131,12 +131,13 @@ cifs_bp_rename_retry:
 }
 
 struct cifsFileInfo *
-cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
+cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		  struct tcon_link *tlink, __u32 oplock)
 {
 	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
 	struct cifsFileInfo *pCifsFile;
-	struct cifsInodeInfo *pCifsInode;
 
 	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
 	if (pCifsFile == NULL)
@@ -158,24 +159,20 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file,
 
 	write_lock(&GlobalSMBSeslock);
 	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
-	pCifsInode = CIFS_I(newinode);
-	if (pCifsInode) {
-		/* if readable file instance put first in list*/
-		if (file->f_mode & FMODE_READ)
-			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
-		else
-			list_add_tail(&pCifsFile->flist,
-				      &pCifsInode->openFileList);
-
-		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-			pCifsInode->clientCanCacheAll = true;
-			pCifsInode->clientCanCacheRead = true;
-			cFYI(1, "Exclusive Oplock inode %p", newinode);
-		} else if ((oplock & 0xF) == OPLOCK_READ)
-				pCifsInode->clientCanCacheRead = true;
-	}
+	/* if readable file instance put first in list*/
+	if (file->f_mode & FMODE_READ)
+		list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+	else
+		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
 	write_unlock(&GlobalSMBSeslock);
 
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock inode %p", inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
 	file->private_data = pCifsFile;
 
 	return pCifsFile;
@@ -395,8 +392,7 @@ cifs_create_set_dentry:
 			goto cifs_create_out;
 		}
 
-		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-						tlink, oplock);
+		pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -669,8 +665,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 				goto lookup_out;
 			}
 
-			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  tlink, oplock);
+			cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
+						  oplock);
 			if (cfile == NULL) {
 				fput(filp);
 				CIFSSMBClose(xid, pTcon, fileHandle);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 774e3ac1208b..394cf28f080d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -272,8 +272,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
 
-			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
-							tlink, oplock);
+			pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
+						      oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
@@ -365,7 +365,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (rc != 0)
 		goto out;
 
-	pCifsFile = cifs_new_fileinfo(inode, netfid, file, tlink, oplock);
+	pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
-- 
cgit v1.2.3


From 15886177e412db00aa86155fe72608c4ebf5a08f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:33:59 -0400
Subject: cifs: clean up cifs_reopen_file

Add a f_flags field that holds the f_flags field from the filp. We'll
need this info in case the filp ever goes away before the cifsFileInfo
does. Have cifs_reopen_file use that value instead of filp->f_flags
too and have it take a cifsFileInfo arg instead of a filp.

While we're at it, get rid of some bogus cargo-cult NULL pointer
checks in that function and reduce the level of indentation.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |   1 +
 fs/cifs/dir.c      |   1 +
 fs/cifs/file.c     | 129 +++++++++++++++++++++++------------------------------
 3 files changed, 57 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6c69bd762498..b746dec88396 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -390,6 +390,7 @@ struct cifsFileInfo {
 	/* lock scope id (0 if none) */
 	struct file *pfile; /* needed for writepage */
 	struct dentry *dentry;
+	unsigned int f_flags;
 	struct tcon_link *tlink;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 36cf3e2ec252..3c23d882a44c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -147,6 +147,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	pCifsFile->pid = current->tgid;
 	pCifsFile->uid = current_fsuid();
 	pCifsFile->dentry = dget(dentry);
+	pCifsFile->f_flags = file->f_flags;
 	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 394cf28f080d..dd922a5f9cae 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -412,14 +412,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 	return rc;
 }
 
-static int cifs_reopen_file(struct file *file, bool can_flush)
+static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
 {
 	int rc = -EACCES;
 	int xid;
 	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct inode *inode;
 	char *full_path = NULL;
@@ -427,11 +426,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	int disposition = FILE_OPEN;
 	__u16 netfid;
 
-	if (file->private_data)
-		pCifsFile = file->private_data;
-	else
-		return -EBADF;
-
 	xid = GetXid();
 	mutex_lock(&pCifsFile->fh_mutex);
 	if (!pCifsFile->invalidHandle) {
@@ -441,21 +435,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 		return rc;
 	}
 
-	if (file->f_path.dentry == NULL) {
-		cERROR(1, "no valid name if dentry freed");
-		dump_stack();
-		rc = -EBADF;
-		goto reopen_error_exit;
-	}
-
-	inode = file->f_path.dentry->d_inode;
-	if (inode == NULL) {
-		cERROR(1, "inode not valid");
-		dump_stack();
-		rc = -EBADF;
-		goto reopen_error_exit;
-	}
-
+	inode = pCifsFile->dentry->d_inode;
 	cifs_sb = CIFS_SB(inode->i_sb);
 	tcon = tlink_tcon(pCifsFile->tlink);
 
@@ -463,17 +443,16 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
    those that already have the rename sem can end up causing writepage
    to get called and if the server was down that means we end up here,
    and we can never tell if the caller already has the rename_sem */
-	full_path = build_path_from_dentry(file->f_path.dentry);
+	full_path = build_path_from_dentry(pCifsFile->dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-reopen_error_exit:
 		mutex_unlock(&pCifsFile->fh_mutex);
 		FreeXid(xid);
 		return rc;
 	}
 
 	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path);
+		 inode, pCifsFile->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -488,7 +467,8 @@ reopen_error_exit:
 		 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
 		 * original open. Must mask them off for a reopen.
 		 */
-		unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+		unsigned int oflags = pCifsFile->f_flags &
+						~(O_CREAT | O_EXCL | O_TRUNC);
 
 		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
 				cifs_sb->mnt_file_mode /* ignored */,
@@ -501,7 +481,7 @@ reopen_error_exit:
 		   in the reconnect path it is important to retry hard */
 	}
 
-	desiredAccess = cifs_convert_flags(file->f_flags);
+	desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
 
 	/* Can not refresh inode by passing in file_info buf to be returned
 	   by SMBOpen and then calling get_inode_info with returned buf
@@ -517,49 +497,50 @@ reopen_error_exit:
 		mutex_unlock(&pCifsFile->fh_mutex);
 		cFYI(1, "cifs_open returned 0x%x", rc);
 		cFYI(1, "oplock: %d", oplock);
-	} else {
+		goto reopen_error_exit;
+	}
+
 reopen_success:
-		pCifsFile->netfid = netfid;
-		pCifsFile->invalidHandle = false;
-		mutex_unlock(&pCifsFile->fh_mutex);
-		pCifsInode = CIFS_I(inode);
-		if (pCifsInode) {
-			if (can_flush) {
-				rc = filemap_write_and_wait(inode->i_mapping);
-				if (rc != 0)
-					CIFS_I(inode)->write_behind_rc = rc;
-			/* temporarily disable caching while we
-			   go to server to get inode info */
-				pCifsInode->clientCanCacheAll = false;
-				pCifsInode->clientCanCacheRead = false;
-				if (tcon->unix_ext)
-					rc = cifs_get_inode_info_unix(&inode,
-						full_path, inode->i_sb, xid);
-				else
-					rc = cifs_get_inode_info(&inode,
-						full_path, NULL, inode->i_sb,
-						xid, NULL);
-			} /* else we are writing out data to server already
-			     and could deadlock if we tried to flush data, and
-			     since we do not know if we have data that would
-			     invalidate the current end of file on the server
-			     we can not go to the server to get the new inod
-			     info */
-			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-				pCifsInode->clientCanCacheAll = true;
-				pCifsInode->clientCanCacheRead = true;
-				cFYI(1, "Exclusive Oplock granted on inode %p",
-					 file->f_path.dentry->d_inode);
-			} else if ((oplock & 0xF) == OPLOCK_READ) {
-				pCifsInode->clientCanCacheRead = true;
-				pCifsInode->clientCanCacheAll = false;
-			} else {
-				pCifsInode->clientCanCacheRead = false;
-				pCifsInode->clientCanCacheAll = false;
-			}
-			cifs_relock_file(pCifsFile);
-		}
+	pCifsFile->netfid = netfid;
+	pCifsFile->invalidHandle = false;
+	mutex_unlock(&pCifsFile->fh_mutex);
+	pCifsInode = CIFS_I(inode);
+
+	if (can_flush) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc != 0)
+			CIFS_I(inode)->write_behind_rc = rc;
+
+		pCifsInode->clientCanCacheAll = false;
+		pCifsInode->clientCanCacheRead = false;
+		if (tcon->unix_ext)
+			rc = cifs_get_inode_info_unix(&inode,
+				full_path, inode->i_sb, xid);
+		else
+			rc = cifs_get_inode_info(&inode,
+				full_path, NULL, inode->i_sb,
+				xid, NULL);
+	} /* else we are writing out data to server already
+	     and could deadlock if we tried to flush data, and
+	     since we do not know if we have data that would
+	     invalidate the current end of file on the server
+	     we can not go to the server to get the new inod
+	     info */
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 pCifsFile->dentry->d_inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ) {
+		pCifsInode->clientCanCacheRead = true;
+		pCifsInode->clientCanCacheAll = false;
+	} else {
+		pCifsInode->clientCanCacheRead = false;
+		pCifsInode->clientCanCacheAll = false;
 	}
+	cifs_relock_file(pCifsFile);
+
+reopen_error_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -998,7 +979,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to server
 				   now */
-				rc = cifs_reopen_file(file, false);
+				rc = cifs_reopen_file(open_file, false);
 				if (rc != 0)
 					break;
 			}
@@ -1096,7 +1077,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to
 				   server now */
-				rc = cifs_reopen_file(file, false);
+				rc = cifs_reopen_file(open_file, false);
 				if (rc != 0)
 					break;
 			}
@@ -1244,7 +1225,7 @@ refind_writable:
 
 			read_unlock(&GlobalSMBSeslock);
 			/* Had to unlock since following call can block */
-			rc = cifs_reopen_file(open_file->pfile, false);
+			rc = cifs_reopen_file(open_file, false);
 			if (!rc) {
 				if (!open_file->closePend)
 					return open_file;
@@ -1792,7 +1773,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			int buf_type = CIFS_NO_BUFFER;
 			if ((open_file->invalidHandle) &&
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -1878,7 +1859,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -2043,7 +2024,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
-- 
cgit v1.2.3


From 7da4b49a0ec481239d9cd51f7053aa8e4e191fa7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:00 -0400
Subject: cifs: cifs_write argument change and cleanup

Have cifs_write take a cifsFileInfo pointer instead of a filp. Since
cifsFileInfo holds references on the dentry, and that holds one to
the inode, we can eliminate some unneeded NULL pointer checks.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 51 +++++++++++++++++----------------------------------
 1 file changed, 17 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index dd922a5f9cae..d5f3007364e3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1027,8 +1027,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	return total_written;
 }
 
-static ssize_t cifs_write(struct file *file, const char *write_data,
-			  size_t write_size, loff_t *poffset)
+static ssize_t cifs_write(struct cifsFileInfo *open_file,
+			  const char *write_data, size_t write_size,
+			  loff_t *poffset)
 {
 	int rc = 0;
 	unsigned int bytes_written = 0;
@@ -1036,17 +1037,14 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 	int xid, long_op;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+	struct dentry *dentry = open_file->dentry;
+	struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
 
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	cifs_sb = CIFS_SB(dentry->d_sb);
 
 	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name);
+	   *poffset, dentry->d_name.name);
 
-	if (file->private_data == NULL)
-		return -EBADF;
-	open_file = file->private_data;
 	pTcon = tlink_tcon(open_file->tlink);
 
 	xid = GetXid();
@@ -1056,15 +1054,6 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if (file->private_data == NULL) {
-				/* file has been closed on us */
-				FreeXid(xid);
-			/* if we have gotten here we have written some data
-			   and blocked, and the file has been freed on us
-			   while we blocked so return what we managed to
-			   write */
-				return total_written;
-			}
 			if (open_file->closePend) {
 				FreeXid(xid);
 				if (total_written)
@@ -1124,20 +1113,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	cifs_stats_bytes_written(pTcon, total_written);
 
-	/* since the write may have blocked check these pointers again */
-	if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-/*BB We could make this contingent on superblock ATIME flag too */
-/*		file->f_path.dentry->d_inode->i_ctime =
-		file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/
-		if (total_written > 0) {
-			spin_lock(&file->f_path.dentry->d_inode->i_lock);
-			if (*poffset > file->f_path.dentry->d_inode->i_size)
-				i_size_write(file->f_path.dentry->d_inode,
-					     *poffset);
-			spin_unlock(&file->f_path.dentry->d_inode->i_lock);
-		}
-		mark_inode_dirty_sync(file->f_path.dentry->d_inode);
+	if (total_written > 0) {
+		spin_lock(&dentry->d_inode->i_lock);
+		if (*poffset > dentry->d_inode->i_size)
+			i_size_write(dentry->d_inode, *poffset);
+		spin_unlock(&dentry->d_inode->i_lock);
 	}
+	mark_inode_dirty_sync(dentry->d_inode);
 	FreeXid(xid);
 	return total_written;
 }
@@ -1308,8 +1290,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 
 	open_file = find_writable_file(CIFS_I(mapping->host), false);
 	if (open_file) {
-		bytes_written = cifs_write(open_file->pfile, write_data,
-					   to-from, &offset);
+		bytes_written = cifs_write(open_file, write_data,
+					   to - from, &offset);
 		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1624,7 +1606,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, copied, &pos);
+		rc = cifs_write(file->private_data, page_data + offset,
+				copied, &pos);
 		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
 
-- 
cgit v1.2.3


From 2e396b83f6087b78dac5a18d6d0cf9f8426a00b3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:01 -0400
Subject: cifs: eliminate pfile pointer from cifsFileInfo

All the remaining users of cifsFileInfo->pfile just use it to get
at the f_flags/f_mode. Now that we store that separately in the
cifsFileInfo, there's no need to consult the pfile at all from
a cifsFileInfo pointer.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  1 -
 fs/cifs/dir.c      |  1 -
 fs/cifs/file.c     | 11 +++--------
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b746dec88396..9f99afab9a04 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -388,7 +388,6 @@ struct cifsFileInfo {
 	__u16 netfid;		/* file id from remote */
 	/* BB add lock scope info here if needed */ ;
 	/* lock scope id (0 if none) */
-	struct file *pfile; /* needed for writepage */
 	struct dentry *dentry;
 	unsigned int f_flags;
 	struct tcon_link *tlink;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3c23d882a44c..600eac18cb21 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -148,7 +148,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	pCifsFile->uid = current_fsuid();
 	pCifsFile->dentry = dget(dentry);
 	pCifsFile->f_flags = file->f_flags;
-	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
 	pCifsFile->tlink = cifs_get_tlink(tlink);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d5f3007364e3..7935816fa111 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1144,8 +1144,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 			continue;
 		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
-		if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) ||
-		    (open_file->pfile->f_flags & O_RDONLY))) {
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
 			if (!open_file->invalidHandle) {
 				/* found a good file */
 				/* lock it so it will not be closed on us */
@@ -1194,9 +1193,7 @@ refind_writable:
 			continue;
 		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
 			cifsFileInfo_get(open_file);
 
 			if (!open_file->invalidHandle) {
@@ -2160,9 +2157,7 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
 			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
 			read_unlock(&GlobalSMBSeslock);
 			return 1;
 		}
-- 
cgit v1.2.3


From 15ecb436c00fcb13b6dc32bdcbb9f75fc9b7613e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:02 -0400
Subject: cifs: move cifs_new_fileinfo to file.c

It's currently in dir.c which makes little sense...

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c  | 48 ------------------------------------------------
 fs/cifs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 600eac18cb21..3840eddbfb7a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,54 +130,6 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
-struct cifsFileInfo *
-cifs_new_fileinfo(__u16 fileHandle, struct file *file,
-		  struct tcon_link *tlink, __u32 oplock)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-	struct cifsFileInfo *pCifsFile;
-
-	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-	if (pCifsFile == NULL)
-		return pCifsFile;
-
-	pCifsFile->netfid = fileHandle;
-	pCifsFile->pid = current->tgid;
-	pCifsFile->uid = current_fsuid();
-	pCifsFile->dentry = dget(dentry);
-	pCifsFile->f_flags = file->f_flags;
-	pCifsFile->invalidHandle = false;
-	pCifsFile->closePend = false;
-	pCifsFile->tlink = cifs_get_tlink(tlink);
-	mutex_init(&pCifsFile->fh_mutex);
-	mutex_init(&pCifsFile->lock_mutex);
-	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->count, 1);
-	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
-
-	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
-	/* if readable file instance put first in list*/
-	if (file->f_mode & FMODE_READ)
-		list_add(&pCifsFile->flist, &pCifsInode->openFileList);
-	else
-		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
-	write_unlock(&GlobalSMBSeslock);
-
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock inode %p", inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ)
-		pCifsInode->clientCanCacheRead = true;
-
-	file->private_data = pCifsFile;
-
-	return pCifsFile;
-}
-
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7935816fa111..293e9b767621 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,6 +219,53 @@ posix_open_ret:
 	return rc;
 }
 
+struct cifsFileInfo *
+cifs_new_fileinfo(__u16 fileHandle, struct file *file,
+		  struct tcon_link *tlink, __u32 oplock)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
+	struct cifsFileInfo *pCifsFile;
+
+	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (pCifsFile == NULL)
+		return pCifsFile;
+
+	pCifsFile->netfid = fileHandle;
+	pCifsFile->pid = current->tgid;
+	pCifsFile->uid = current_fsuid();
+	pCifsFile->dentry = dget(dentry);
+	pCifsFile->f_flags = file->f_flags;
+	pCifsFile->invalidHandle = false;
+	pCifsFile->closePend = false;
+	pCifsFile->tlink = cifs_get_tlink(tlink);
+	mutex_init(&pCifsFile->fh_mutex);
+	mutex_init(&pCifsFile->lock_mutex);
+	INIT_LIST_HEAD(&pCifsFile->llist);
+	atomic_set(&pCifsFile->count, 1);
+	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
+	/* if readable file instance put first in list*/
+	if (file->f_mode & FMODE_READ)
+		list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+	else
+		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
+	write_unlock(&GlobalSMBSeslock);
+
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock inode %p", inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
+	file->private_data = pCifsFile;
+	return pCifsFile;
+}
+
 int cifs_open(struct inode *inode, struct file *file)
 {
 	int rc = -EACCES;
-- 
cgit v1.2.3


From 7a16f1961a5c61d1f60d9e0d3d171cf7793fb5cb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 18 Oct 2010 01:09:48 +0000
Subject: [CIFS] Fix minor checkpatch warning and update cifs version

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h    | 2 +-
 fs/cifs/cifsproto.h | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 85544bcab517..f35795a16b42 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -112,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.65"
+#define CIFS_VERSION   "1.67"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ca6a0d9130cb..e593c40ba7ba 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -105,8 +105,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 
-extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file,
-				struct tcon_link *tlink, __u32 oplock);
+extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
+				struct file *file, struct tcon_link *tlink,
+				__u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
 				int mode, unsigned int f_flags,
-- 
cgit v1.2.3


From 4477288a103631980750c86547d1fd54bfd2ba7d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:03 -0400
Subject: cifs: convert GlobalSMBSeslock from a rwlock to regular spinlock

Convert this lock to a regular spinlock

A rwlock_t offers little value here. It's more expensive than a regular
spinlock unless you have a fairly large section of code that runs under
the read lock and can benefit from the concurrency.

Additionally, we need to ensure that the refcounting for files isn't
racy and to do that we need to lock areas that can increment it for
write. That means that the areas that can actually use a read_lock are
very few and relatively infrequently used.

While we're at it, change the name to something easier to type, and fix
a bug in find_writable_file. cifsFileInfo_put can sleep and shouldn't be
called while holding the lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |  2 +-
 fs/cifs/cifsglob.h |  2 +-
 fs/cifs/cifssmb.c  |  4 ++--
 fs/cifs/file.c     | 54 +++++++++++++++++++++++++++---------------------------
 fs/cifs/misc.c     |  8 ++++----
 fs/cifs/readdir.c  |  6 +++---
 6 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index cbd468c880c4..f1d9c71e807f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -940,8 +940,8 @@ init_cifs(void)
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
-	rwlock_init(&GlobalSMBSeslock);
 	rwlock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_file_list_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9f99afab9a04..53899a8d7c4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -720,7 +720,7 @@ GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
  * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
  * the cifs_tcp_ses_lock must be grabbed first and released last.
  */
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
+GLOBAL_EXTERN spinlock_t	cifs_file_list_lock;
 
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a420c7b08522..bfb59a68e4fd 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	struct list_head *tmp1;
 
 /* list all files open on tree connection and mark them invalid */
-	write_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
 		open_file = list_entry(tmp, struct cifsFileInfo, tlist);
 		open_file->invalidHandle = true;
 		open_file->oplock_break_cancelled = true;
 	}
-	write_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
 	   to this tcon */
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 293e9b767621..26048dc9069a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -246,14 +246,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	atomic_set(&pCifsFile->count, 1);
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
-	write_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
 	/* if readable file instance put first in list*/
 	if (file->f_mode & FMODE_READ)
 		list_add(&pCifsFile->flist, &pCifsInode->openFileList);
 	else
 		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
-	write_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 
 	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
@@ -607,13 +607,13 @@ int cifs_close(struct inode *inode, struct file *file)
 	pTcon = tlink_tcon(pSMBFile->tlink);
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		pSMBFile->closePend = true;
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
 			if (!pTcon->need_reconnect) {
-				write_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->count) != 1)
 					&& (timeout <= 2048)) {
@@ -633,9 +633,9 @@ int cifs_close(struct inode *inode, struct file *file)
 					rc = CIFSSMBClose(xid, pTcon,
 						  pSMBFile->netfid);
 			} else
-				write_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 		} else
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
@@ -646,16 +646,16 @@ int cifs_close(struct inode *inode, struct file *file)
 		}
 		mutex_unlock(&pSMBFile->lock_mutex);
 
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		list_del(&pSMBFile->flist);
 		list_del(&pSMBFile->tlist);
-		write_unlock(&GlobalSMBSeslock);
+		spin_unlock(&cifs_file_list_lock);
 		cifsFileInfo_put(file->private_data);
 		file->private_data = NULL;
 	} else
 		rc = -EBADF;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	if (list_empty(&(CIFS_I(inode)->openFileList))) {
 		cFYI(1, "closing last open instance for inode %p", inode);
 		/* if the file is not open we do not know if we can cache info
@@ -663,7 +663,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		CIFS_I(inode)->clientCanCacheRead = false;
 		CIFS_I(inode)->clientCanCacheAll  = false;
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
 		rc = CIFS_I(inode)->write_behind_rc;
 	FreeXid(xid);
@@ -685,18 +685,18 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
 
 		cFYI(1, "Freeing private data in close dir");
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
 			cFYI(1, "Closing uncompleted readdir with rc %d",
 				 rc);
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
 		} else
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
 			cFYI(1, "closedir free smb buf in srch struct");
@@ -1182,7 +1182,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		fsuid_only = false;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	/* we could simply get the first_list_entry since write-only entries
 	   are always at the end of the list but since the first entry might
 	   have a close pending, we go through the whole list */
@@ -1196,7 +1196,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 				/* found a good file */
 				/* lock it so it will not be closed on us */
 				cifsFileInfo_get(open_file);
-				read_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				return open_file;
 			} /* else might as well continue, and look for
 			     another, or simply have the caller reopen it
@@ -1204,7 +1204,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 		} else /* write only file */
 			break; /* write only files are last so must be done */
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return NULL;
 }
 #endif
@@ -1231,7 +1231,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		fsuid_only = false;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
@@ -1245,11 +1245,11 @@ refind_writable:
 
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
-				read_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				return open_file;
 			}
 
-			read_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			/* Had to unlock since following call can block */
 			rc = cifs_reopen_file(open_file, false);
 			if (!rc) {
@@ -1257,7 +1257,7 @@ refind_writable:
 					return open_file;
 				else { /* start over in case this was deleted */
 				       /* since the list could be modified */
-					read_lock(&GlobalSMBSeslock);
+					spin_lock(&cifs_file_list_lock);
 					cifsFileInfo_put(open_file);
 					goto refind_writable;
 				}
@@ -1271,7 +1271,7 @@ refind_writable:
 			to hold up writepages here (rather than
 			in caller) with continuous retries */
 			cFYI(1, "wp failed on reopen file");
-			read_lock(&GlobalSMBSeslock);
+			spin_lock(&cifs_file_list_lock);
 			/* can not use this handle, no write
 			   pending on this one after all */
 			cifsFileInfo_put(open_file);
@@ -1292,7 +1292,7 @@ refind_writable:
 		any_available = true;
 		goto refind_writable;
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return NULL;
 }
 
@@ -2200,16 +2200,16 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
 			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
-			read_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			return 1;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return 0;
 }
 
@@ -2373,8 +2373,8 @@ void cifs_oplock_break(struct work_struct *work)
 	 * finished grabbing reference for us.  Make sure it's done by
 	 * waiting for GlobalSMSSeslock.
 	 */
-	write_lock(&GlobalSMBSeslock);
-	write_unlock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
+	spin_unlock(&cifs_file_list_lock);
 
 	cifs_oplock_break_put(cfile);
 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 9bac3e74b314..de6073cccd9c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -560,7 +560,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				continue;
 
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			read_lock(&GlobalSMBSeslock);
+			spin_lock(&cifs_file_list_lock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
@@ -572,7 +572,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				 * closed anyway.
 				 */
 				if (netfile->closePend) {
-					read_unlock(&GlobalSMBSeslock);
+					spin_unlock(&cifs_file_list_lock);
 					read_unlock(&cifs_tcp_ses_lock);
 					return true;
 				}
@@ -594,11 +594,11 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 					cifs_oplock_break_get(netfile);
 				netfile->oplock_break_cancelled = false;
 
-				read_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				read_unlock(&cifs_tcp_ses_lock);
 				return true;
 			}
-			read_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, "No matching file for oplock break");
 			return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6f3d13ff9470..ef7bb7b50f58 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -528,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
 		cFYI(1, "search backing up - close and restart search");
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
 			cifsFile->invalidHandle = true;
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		} else
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1, "freeing SMB ff cache buf on search rewind");
 			if (cifsFile->srch_inf.smallBuf)
-- 
cgit v1.2.3


From b33879aa834ebe03ced3dca4e3b822bd8894a474 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:04 -0400
Subject: cifs: move cifsFileInfo_put to file.c

...and make it non-inlined in preparation for the move of most of
cifs_close to it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 10 +---------
 fs/cifs/file.c     | 10 ++++++++++
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 53899a8d7c4a..18ee0adda306 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -408,15 +408,7 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 	atomic_inc(&cifs_file->count);
 }
 
-/* Release a reference on the file private data */
-static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
-{
-	if (atomic_dec_and_test(&cifs_file->count)) {
-		cifs_put_tlink(cifs_file->tlink);
-		dput(cifs_file->dentry);
-		kfree(cifs_file);
-	}
-}
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
 /*
  * One of these for each file inode
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 26048dc9069a..a3634e43bd4f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -266,6 +266,16 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	return pCifsFile;
 }
 
+/* Release a reference on the file private data */
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	if (atomic_dec_and_test(&cifs_file->count)) {
+		cifs_put_tlink(cifs_file->tlink);
+		dput(cifs_file->dentry);
+		kfree(cifs_file);
+	}
+}
+
 int cifs_open(struct inode *inode, struct file *file)
 {
 	int rc = -EACCES;
-- 
cgit v1.2.3


From 631dd1a885b6d7e9f6f51b4e5b311c2bb04c323c Mon Sep 17 00:00:00 2001
From: "Justin P. Mattock" <justinmattock@gmail.com>
Date: Mon, 18 Oct 2010 11:03:14 +0200
Subject: Update broken web addresses in the kernel.

The patch below updates broken web addresses in the kernel

Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Finn Thain <fthain@telegraphics.com.au>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Dimitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Ben Pfaff <blp@cs.stanford.edu>
Acked-by: Hans J. Koch <hjk@linutronix.de>
Reviewed-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/ata/pata_it821x.c                       |  4 ++--
 drivers/atm/Kconfig                             |  2 +-
 drivers/char/agp/Kconfig                        |  2 +-
 drivers/char/agp/i460-agp.c                     |  2 +-
 drivers/char/apm-emulation.c                    |  4 ++--
 drivers/char/ipmi/ipmi_bt_sm.c                  |  2 +-
 drivers/char/ipmi/ipmi_si_intf.c                |  3 +--
 drivers/char/n_r3964.c                          |  1 -
 drivers/char/pcmcia/Kconfig                     |  4 ++--
 drivers/char/tpm/Kconfig                        |  2 +-
 drivers/char/tpm/tpm_infineon.c                 |  2 +-
 drivers/edac/edac_device_sysfs.c                |  2 +-
 drivers/edac/i82443bxgx_edac.c                  |  2 +-
 drivers/firmware/Kconfig                        |  3 ++-
 drivers/firmware/edd.c                          |  2 +-
 drivers/firmware/pcdp.h                         |  4 ++--
 drivers/gpu/drm/drm_modes.c                     |  2 +-
 drivers/hwmon/adm1025.c                         |  2 +-
 drivers/hwmon/adm1026.c                         |  2 +-
 drivers/hwmon/f75375s.c                         |  4 ++--
 drivers/hwmon/g760a.c                           |  2 +-
 drivers/hwmon/hwmon-vid.c                       |  2 +-
 drivers/ide/hpt366.c                            |  2 +-
 drivers/ide/ht6560b.c                           |  1 -
 drivers/infiniband/Kconfig                      |  4 ++--
 drivers/infiniband/hw/cxgb3/Kconfig             |  2 +-
 drivers/infiniband/hw/cxgb4/Kconfig             |  2 +-
 drivers/infiniband/ulp/iser/Kconfig             |  2 +-
 drivers/input/joystick/gamecon.c                |  3 +--
 drivers/input/misc/cm109.c                      |  2 +-
 drivers/input/mouse/Kconfig                     |  1 +
 drivers/input/mouse/touchkit_ps2.c              |  4 ++--
 drivers/input/touchscreen/mk712.c               |  2 +-
 drivers/isdn/i4l/isdn_audio.c                   |  2 +-
 drivers/macintosh/therm_adt746x.c               |  6 +++---
 drivers/media/IR/keymaps/rc-manli.c             |  1 -
 drivers/media/dvb/ttpci/av7110.c                |  9 ++-------
 drivers/media/dvb/ttpci/av7110_av.c             |  2 +-
 drivers/media/dvb/ttpci/av7110_ca.c             |  2 +-
 drivers/media/dvb/ttpci/av7110_hw.c             |  2 +-
 drivers/media/dvb/ttpci/av7110_v4l.c            |  2 +-
 drivers/media/dvb/ttpci/budget-av.c             |  2 +-
 drivers/media/dvb/ttpci/budget-ci.c             |  2 +-
 drivers/media/dvb/ttpci/budget-core.c           |  2 +-
 drivers/media/dvb/ttpci/budget-patch.c          |  2 +-
 drivers/media/dvb/ttpci/budget.c                |  2 +-
 drivers/media/radio/radio-maxiradio.c           |  2 +-
 drivers/media/radio/radio-typhoon.c             |  3 ---
 drivers/media/video/Kconfig                     |  2 +-
 drivers/media/video/cafe_ccic.c                 |  2 +-
 drivers/media/video/cx18/cx18-cards.c           |  2 +-
 drivers/media/video/cx23885/cx23885-417.c       |  2 +-
 drivers/media/video/cx88/cx88-blackbird.c       |  2 +-
 drivers/media/video/ivtv/ivtv-cards.c           |  2 +-
 drivers/media/video/mxb.c                       |  2 +-
 drivers/media/video/sn9c102/sn9c102_pas202bcb.c |  1 -
 drivers/misc/Kconfig                            |  4 ++--
 drivers/mtd/chips/cfi_cmdset_0002.c             |  4 ++--
 drivers/mtd/devices/lart.c                      |  2 +-
 drivers/mtd/ftl.c                               |  2 +-
 drivers/mtd/maps/Kconfig                        |  4 ++--
 drivers/mtd/nand/cafe_nand.c                    |  2 +-
 drivers/net/Kconfig                             | 21 ++++++++++-----------
 drivers/net/appletalk/Kconfig                   |  2 +-
 drivers/net/atp.c                               |  2 +-
 drivers/net/epic100.c                           |  4 ++--
 drivers/net/hamradio/Kconfig                    |  2 +-
 drivers/net/ibmlana.c                           |  2 +-
 drivers/net/irda/donauboe.h                     |  2 +-
 drivers/net/pci-skeleton.c                      |  2 +-
 drivers/net/pcmcia/3c574_cs.c                   |  2 +-
 drivers/net/sc92031.c                           |  2 +-
 drivers/net/tlan.c                              |  2 +-
 drivers/net/tokenring/tms380tr.c                |  2 +-
 drivers/net/tulip/Kconfig                       |  2 +-
 drivers/net/usb/plusb.c                         |  2 +-
 drivers/net/wan/Kconfig                         |  2 +-
 drivers/net/wireless/ath/ath5k/ath5k.h          |  2 +-
 drivers/net/wireless/ath/ath5k/reg.h            |  1 -
 drivers/net/wireless/p54/Kconfig                |  6 +++---
 drivers/net/wireless/prism54/islpci_hotplug.c   |  2 +-
 drivers/parisc/README.dino                      |  3 +--
 drivers/pci/quirks.c                            |  3 ++-
 drivers/pcmcia/yenta_socket.c                   |  2 +-
 drivers/pnp/pnpbios/proc.c                      |  1 -
 drivers/scsi/Kconfig                            |  9 +++++----
 drivers/serial/8250.c                           |  2 +-
 drivers/serial/bfin_sport_uart.c                |  2 +-
 drivers/serial/bfin_sport_uart.h                |  2 +-
 drivers/serial/uartlite.c                       |  2 +-
 drivers/staging/asus_oled/README                |  2 +-
 drivers/staging/asus_oled/asus_oled.c           |  2 +-
 drivers/staging/comedi/drivers/cb_pcimdas.c     |  2 +-
 drivers/staging/comedi/drivers/daqboard2000.c   |  4 ++--
 drivers/staging/comedi/drivers/ni_labpc.c       |  2 +-
 drivers/staging/comedi/drivers/ni_mio_common.c  |  2 +-
 drivers/staging/comedi/drivers/plx9080.h        |  2 +-
 drivers/staging/comedi/drivers/rtd520.c         |  2 +-
 drivers/staging/quickstart/quickstart.c         |  3 +--
 drivers/uio/Kconfig                             |  6 +++---
 drivers/usb/serial/Kconfig                      |  4 ++--
 drivers/usb/serial/ftdi_sio_ids.h               | 12 ++++++------
 drivers/usb/serial/keyspan.c                    |  2 +-
 drivers/usb/serial/keyspan.h                    |  2 +-
 drivers/usb/serial/mct_u232.h                   |  9 ++++-----
 drivers/usb/storage/Kconfig                     |  2 +-
 drivers/video/Kconfig                           | 10 +++++-----
 drivers/video/arcfb.c                           |  1 -
 drivers/video/epson1355fb.c                     |  2 +-
 drivers/video/fbcvt.c                           |  2 +-
 drivers/video/metronomefb.c                     |  2 +-
 firmware/keyspan_pda/keyspan_pda.S              |  2 +-
 firmware/keyspan_pda/xircom_pgs.S               |  2 +-
 fs/hostfs/hostfs.h                              |  7 +------
 fs/partitions/ldm.c                             |  2 +-
 fs/partitions/ldm.h                             |  2 +-
 fs/reiserfs/Kconfig                             |  6 ++++--
 fs/reiserfs/README                              |  2 +-
 include/crypto/gf128mul.h                       |  4 ++--
 include/linux/fdreg.h                           |  2 +-
 include/linux/if_infiniband.h                   |  2 +-
 include/linux/n_r3964.h                         |  1 -
 net/ax25/Kconfig                                |  8 ++++----
 net/ipv4/Kconfig                                |  4 ++--
 net/ipv4/cipso_ipv4.c                           |  2 +-
 net/ipv4/fib_trie.c                             |  2 +-
 net/ipv4/netfilter/Kconfig                      |  2 +-
 net/ipv4/tcp_illinois.c                         |  2 +-
 net/ipv4/tcp_input.c                            |  4 ++--
 net/ipv4/tcp_veno.c                             |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c          |  4 ++--
 sound/oss/ac97_codec.c                          |  7 ++-----
 sound/pci/ens1370.c                             |  2 +-
 sound/pci/intel8x0.c                            |  2 +-
 134 files changed, 183 insertions(+), 207 deletions(-)

(limited to 'fs')

diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c
index bf88f71a21f4..aa0e0c51cc08 100644
--- a/drivers/ata/pata_it821x.c
+++ b/drivers/ata/pata_it821x.c
@@ -15,8 +15,8 @@
  *  May be copied or modified under the terms of the GNU General Public License
  *  Based in part on the ITE vendor provided SCSI driver.
  *
- *  Documentation available from
- * 	http://www.ite.com.tw/pc/IT8212F_V04.pdf
+ *  Documentation available from IT8212F_V04.pdf
+ * 	http://www.ite.com.tw/EN/products_more.aspx?CategoryID=3&ID=5,91
  *  Some other documents are NDA.
  *
  *  The ITE8212 isn't exactly a standard IDE controller. It has two
diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig
index be7461c9a87e..31c60101a69a 100644
--- a/drivers/atm/Kconfig
+++ b/drivers/atm/Kconfig
@@ -301,7 +301,7 @@ config ATM_IA
 	  control memory (128K-1KVC, 512K-4KVC), the size of the packet
 	  memory (128K, 512K, 1M), and the PHY type (Single/Multi mode OC3,
 	  UTP155, UTP25, DS3 and E3). Go to:
-	  	<http://www.iphase.com/products/ClassSheet.cfm?ClassID=ATM>
+	  	<http://www.iphase.com/>
 	  for more info about the cards. Say Y (or M to compile as a module
 	  named iphase) here if you have one of these cards.
 
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4b66c69eaf57..c8ad61958e24 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -34,7 +34,7 @@ config AGP_ALI
 	  X on the following ALi chipsets.  The supported chipsets
 	  include M1541, M1621, M1631, M1632, M1641,M1647,and M1651.
 	  For the ALi-chipset question, ALi suggests you refer to
-	  <http://www.ali.com.tw/eng/support/index.shtml>.
+	  <http://www.ali.com.tw/>.
 
 	  The M1541 chipset can do AGP 1x and 2x, but note that there is an
 	  acknowledged incompatibility with Matrox G200 cards. Due to
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index e763d3312ce7..75b763cb3ea1 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -1,7 +1,7 @@
 /*
  * For documentation on the i460 AGP interface, see Chapter 7 (AGP Subsystem) of
  * the "Intel 460GTX Chipset Software Developer's Manual":
- * http://developer.intel.com/design/itanium/downloads/24870401s.htm
+ * http://www.intel.com/design/archives/itanium/downloads/248704.htm 
  */
 /*
  * 460GX support by Chris Ahna <christopher.j.ahna@intel.com>
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index 033e1505fca9..0848ca90255d 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -7,8 +7,8 @@
  *   Intel Corporation, Microsoft Corporation. Advanced Power Management
  *   (APM) BIOS Interface Specification, Revision 1.2, February 1996.
  *
- * [This document is available from Microsoft at:
- *    http://www.microsoft.com/hwdev/busbios/amp_12.htm]
+ * This document is available from Microsoft at:
+ *    http://www.microsoft.com/whdc/archive/amp_12.mspx
  */
 #include <linux/module.h>
 #include <linux/poll.h>
diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
index 7b98c067190a..3ed20e8abc0d 100644
--- a/drivers/char/ipmi/ipmi_bt_sm.c
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -2,7 +2,7 @@
  *  ipmi_bt_sm.c
  *
  *  The state machine for an Open IPMI BT sub-driver under ipmi_si.c, part
- *  of the driver architecture at http://sourceforge.net/project/openipmi
+ *  of the driver architecture at http://sourceforge.net/projects/openipmi 
  *
  *  Author:	Rocky Craig <first.last@hp.com>
  *
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index ff68e7c34ce7..2a84379b9104 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1965,8 +1965,7 @@ static int acpi_gpe_irq_setup(struct smi_info *info)
 
 /*
  * Defined at
- * http://h21007.www2.hp.com/dspp/files/unprotected/devresource/
- * Docs/TechPapers/IA64/hpspmi.pdf
+ * http://h21007.www2.hp.com/portal/download/files/unprot/hpspmi.pdf
  */
 struct SPMITable {
 	s8	Signature[4];
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index a98290d7a2c5..88dda0c45ee0 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -4,7 +4,6 @@
  * Copyright by 
  * Philips Automation Projects
  * Kassel (Germany)
- * http://www.pap-philips.de
  * -----------------------------------------------------------
  * This software may be used and distributed according to the terms of
  * the GNU General Public License, incorporated herein by reference.
diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
index ffa0efce0aed..6614416a8623 100644
--- a/drivers/char/pcmcia/Kconfig
+++ b/drivers/char/pcmcia/Kconfig
@@ -28,7 +28,7 @@ config CARDMAN_4000
 
 	  This kernel driver requires additional userspace support, either
 	  by the vendor-provided PC/SC ifd_handler (http://www.omnikey.com/),
-	  or via the cm4000 backend of OpenCT (http://www.opensc.com/).
+	  or via the cm4000 backend of OpenCT (http://www.opensc-project.org/opensc).
 
 config CARDMAN_4040
 	tristate "Omnikey CardMan 4040 support"
@@ -41,7 +41,7 @@ config CARDMAN_4040
 	  in I/O space.  To use the kernel driver, you will need either the
 	  PC/SC ifdhandler provided from the Omnikey homepage
 	  (http://www.omnikey.com/), or a current development version of OpenCT
-	  (http://www.opensc.org/).
+	  (http://www.opensc-project.org/opensc).
 
 config IPWIRELESS
 	tristate "IPWireless 3G UMTS PCMCIA card support"
diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index 4dc338f3d1aa..f6595aba4f0f 100644
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -58,6 +58,6 @@ config TCG_INFINEON
 	  To compile this driver as a module, choose M here; the module
 	  will be called tpm_infineon.
 	  Further information on this driver and the supported hardware
-	  can be found at http://www.prosec.rub.de/tpm
+	  can be found at http://www.trust.rub.de/projects/linux-device-driver-infineon-tpm/ 
 
 endif # TCG_TPM
diff --git a/drivers/char/tpm/tpm_infineon.c b/drivers/char/tpm/tpm_infineon.c
index f58440791e65..76da32e11f18 100644
--- a/drivers/char/tpm/tpm_infineon.c
+++ b/drivers/char/tpm/tpm_infineon.c
@@ -7,7 +7,7 @@
  * Copyright (C) 2005, Marcel Selhorst <m.selhorst@sirrix.com>
  * Sirrix AG - security technologies, http://www.sirrix.com and
  * Applied Data Security Group, Ruhr-University Bochum, Germany
- * Project-Homepage: http://www.prosec.rub.de/tpm
+ * Project-Homepage: http://www.trust.rub.de/projects/linux-device-driver-infineon-tpm/ 
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 070968178a24..413f0dfbbf75 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -1,7 +1,7 @@
 /*
  * file for managing the edac_device class of devices for EDAC
  *
- * (C) 2007 SoftwareBitMaker (http://www.softwarebitmaker.com)
+ * (C) 2007 SoftwareBitMaker 
  *
  * This file may be distributed under the terms of the
  * GNU General Public License.
diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c
index a2fa1feed724..678405ab04e4 100644
--- a/drivers/edac/i82443bxgx_edac.c
+++ b/drivers/edac/i82443bxgx_edac.c
@@ -12,7 +12,7 @@
  * 440GX fix by Jason Uhlenkott <juhlenko@akamai.com>.
  *
  * Written with reference to 82443BX Host Bridge Datasheet:
- * http://www.intel.com/design/chipsets/440/documentation.htm
+ * http://download.intel.com/design/chipsets/datashts/29063301.pdf 
  * references to this document given in [].
  *
  * This module doesn't support the 440LX, but it may be possible to
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index a6c670b8ce52..af39bbd7394d 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -74,7 +74,8 @@ config EFI_PCDP
 
 	  You must also enable the appropriate drivers (serial, VGA, etc.)
 
-	  See <http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf>
+	  See DIG64_HCDPv20_042804.pdf available from
+	  <http://www.dig64.org/specifications/> 
 
 config DELL_RBU
 	tristate "BIOS update support for DELL systems via sysfs"
diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c
index f287fe79edc4..96c25d93eed1 100644
--- a/drivers/firmware/edd.c
+++ b/drivers/firmware/edd.c
@@ -15,7 +15,7 @@
  * made in setup.S, copied to safe structures in setup.c,
  * and presents it in sysfs.
  *
- * Please see http://linux.dell.com/edd30/results.html for
+ * Please see http://linux.dell.com/edd/results.html for
  * the list of BIOSs which have been reported to implement EDD.
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/firmware/pcdp.h b/drivers/firmware/pcdp.h
index ce910d68bd19..e5530608e00d 100644
--- a/drivers/firmware/pcdp.h
+++ b/drivers/firmware/pcdp.h
@@ -1,8 +1,8 @@
 /*
  * Definitions for PCDP-defined console devices
  *
- * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
- * v2.0:  http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
+ * For DIG64_HCDPv10a_01.pdf and DIG64_PCDPv20.pdf (v1.0a and v2.0 resp.),
+ * please see <http://www.dig64.org/specifications/>
  *
  * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
  *	Khalid Aziz <khalid.aziz@hp.com>
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index f1f473ea97d3..045d63e374c3 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(drm_mode_debug_printmodeline);
  * according to the hdisplay, vdisplay, vrefresh.
  * It is based from the VESA(TM) Coordinated Video Timing Generator by
  * Graham Loveridge April 9, 2003 available at
- * http://www.vesa.org/public/CVT/CVTd6r1.xls
+ * http://www.elo.utfsm.cl/~elo212/docs/CVTd6r1.xls 
  *
  * And it is copied from xf86CVTmode in xserver/hw/xfree86/modes/xf86cvt.c.
  * What I have done is to translate it by using integer calculation.
diff --git a/drivers/hwmon/adm1025.c b/drivers/hwmon/adm1025.c
index 251b63165e2a..60befc0ee65f 100644
--- a/drivers/hwmon/adm1025.c
+++ b/drivers/hwmon/adm1025.c
@@ -12,7 +12,7 @@
  * resolution of about 0.5% of the nominal value). Temperature values are
  * reported with a 1 deg resolution and a 3 deg accuracy. Complete
  * datasheet can be obtained from Analog's website at:
- *   http://www.analog.com/Analog_Root/productPage/productHome/0,2121,ADM1025,00.html
+ *   http://www.onsemi.com/PowerSolutions/product.do?id=ADM1025 
  *
  * This driver also supports the ADM1025A, which differs from the ADM1025
  * only in that it has "open-drain VID inputs while the ADM1025 has
diff --git a/drivers/hwmon/adm1026.c b/drivers/hwmon/adm1026.c
index 65335b268fa9..4bf969c0a32b 100644
--- a/drivers/hwmon/adm1026.c
+++ b/drivers/hwmon/adm1026.c
@@ -6,7 +6,7 @@
 
     Chip details at:
 
-    <http://www.analog.com/UploadedFiles/Data_Sheets/779263102ADM1026_a.pdf>
+    <http://www.onsemi.com/PowerSolutions/product.do?id=ADM1026>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/drivers/hwmon/f75375s.c b/drivers/hwmon/f75375s.c
index 0f58ecc5334d..3e2e10084ad9 100644
--- a/drivers/hwmon/f75375s.c
+++ b/drivers/hwmon/f75375s.c
@@ -6,10 +6,10 @@
  * Datasheets available at:
  *
  * f75375:
- * http://www.fintek.com.tw/files/productfiles/2005111152950.pdf
+ * http://www.fintek.com.tw/files/productfiles/F75375_V026P.pdf 
  *
  * f75373:
- * http://www.fintek.com.tw/files/productfiles/2005111153128.pdf
+ * http://www.fintek.com.tw/files/productfiles/F75373_V025P.pdf
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/drivers/hwmon/g760a.c b/drivers/hwmon/g760a.c
index 1f63d1a3af5e..1d6a6fa31fb4 100644
--- a/drivers/hwmon/g760a.c
+++ b/drivers/hwmon/g760a.c
@@ -5,7 +5,7 @@
     Copyright (C) 2007  Herbert Valerio Riedel <hvr@gnu.org>
 
     Complete datasheet is available at GMT's website:
-      http://www.gmt.com.tw/datasheet/g760a.pdf
+      http://www.gmt.com.tw/product/datasheet/EDS-760A.pdf 
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/drivers/hwmon/hwmon-vid.c b/drivers/hwmon/hwmon-vid.c
index bf0862a803c0..2b2ca1694f95 100644
--- a/drivers/hwmon/hwmon-vid.c
+++ b/drivers/hwmon/hwmon-vid.c
@@ -38,7 +38,7 @@
  * available at http://developer.intel.com/.
  *
  * AMD Athlon 64 and AMD Opteron Processors, AMD Publication 26094,
- * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ * http://support.amd.com/us/Processor_TechDocs/26094.PDF 
  * Table 74. VID Code Voltages
  * This corresponds to an arbitrary VRM code of 24 in the functions below.
  * These CPU models (K8 revision <= E) have 5 VID pins. See also:
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index 45163693f737..97d98fbf5849 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -12,7 +12,7 @@
  *
  *
  * HighPoint has its own drivers (open source except for the RAID part)
- * available from http://www.highpoint-tech.com/BIOS%20+%20Driver/.
+ * available from http://www.highpoint-tech.com/USA_new/service_support.htm 
  * This may be useful to anyone wanting to work on this driver, however  do not
  * trust  them too much since the code tends to become less and less meaningful
  * as the time passes... :-/
diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c
index d81e49680c3f..808bcdcbf8e1 100644
--- a/drivers/ide/ht6560b.c
+++ b/drivers/ide/ht6560b.c
@@ -10,7 +10,6 @@
  *  Author:    Mikko Ala-Fossi            <maf@iki.fi>
  *             Jan Evert van Grootheest   <j.e.van.grootheest@caiway.nl>
  *
- *  Try:  http://www.maf.iki.fi/~maf/ht6560b/
  */
 
 #define DRV_NAME	"ht6560b"
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 89d70de5e235..6e35eccc9caa 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -16,7 +16,7 @@ config INFINIBAND_USER_MAD
 	  Userspace InfiniBand Management Datagram (MAD) support.  This
 	  is the kernel side of the userspace MAD support, which allows
 	  userspace processes to send and receive MADs. You will also
-	  need libibumad from <http://www.openib.org>.
+	  need libibumad from <http://www.openfabrics.org/downloads/management/>.
 
 config INFINIBAND_USER_ACCESS
 	tristate "InfiniBand userspace access (verbs and CM)"
@@ -28,7 +28,7 @@ config INFINIBAND_USER_ACCESS
 	  to set up connections and directly access InfiniBand
 	  hardware for fast-path operations.  You will also need
 	  libibverbs, libibcm and a hardware driver library from
-	  <http://www.openib.org>.
+	  <http://www.openfabrics.org/git/>.
 
 config INFINIBAND_USER_MEM
 	bool
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
index 2acec3fadf69..2b6352b85485 100644
--- a/drivers/infiniband/hw/cxgb3/Kconfig
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -10,7 +10,7 @@ config INFINIBAND_CXGB3
 	  our website at <http://www.chelsio.com>.
 
 	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.htm>.
+	  <http://www.chelsio.com/support.html>.
 
 	  Please send feedback to <linux-bugs@chelsio.com>.
 
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
index ccb85eaaad75..6b7e6c543534 100644
--- a/drivers/infiniband/hw/cxgb4/Kconfig
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -10,7 +10,7 @@ config INFINIBAND_CXGB4
 	  our website at <http://www.chelsio.com>.
 
 	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.htm>.
+	  <http://www.chelsio.com/support.html>.
 
 	  Please send feedback to <linux-bugs@chelsio.com>.
 
diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig
index b411c51842da..d00af71a2cfc 100644
--- a/drivers/infiniband/ulp/iser/Kconfig
+++ b/drivers/infiniband/ulp/iser/Kconfig
@@ -9,4 +9,4 @@ config INFINIBAND_ISER
 
 	  The iSER protocol is defined by IETF.
 	  See <http://www.ietf.org/rfc/rfc5046.txt>
-	  and <http://www.infinibandta.org/members/spec/Annex_iSER.PDF>
+	  and <http://members.infinibandta.org/kwspub/spec/Annex_iSER.PDF>
diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c
index 0ffaf2c77a19..e68e49786483 100644
--- a/drivers/input/joystick/gamecon.c
+++ b/drivers/input/joystick/gamecon.c
@@ -521,9 +521,8 @@ static void gc_multi_process_packet(struct gc *gc)
  * PSX support
  *
  * See documentation at:
- *	http://www.dim.com/~mackys/psxmemcard/ps-eng2.txt
+ *	http://www.geocities.co.jp/Playtown/2004/psx/ps_eng.txt	
  *	http://www.gamesx.com/controldata/psxcont/psxcont.htm
- *	ftp://milano.usal.es/pablo/
  *
  */
 
diff --git a/drivers/input/misc/cm109.c b/drivers/input/misc/cm109.c
index 2b0eba6619bd..b09c7d127219 100644
--- a/drivers/input/misc/cm109.c
+++ b/drivers/input/misc/cm109.c
@@ -259,7 +259,7 @@ static unsigned short keymap_usbph01(int scancode)
 
 /*
  * Keymap for ATCom AU-100
- * http://www.atcom.cn/En_products_AU100.html
+ * http://www.atcom.cn/products.html 
  * http://www.packetizer.com/products/au100/
  * http://www.voip-info.org/wiki/view/AU-100
  *
diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig
index c714ca2407f8..bf5fd7f6a313 100644
--- a/drivers/input/mouse/Kconfig
+++ b/drivers/input/mouse/Kconfig
@@ -30,6 +30,7 @@ config MOUSE_PS2
 		<http://w1.894.telia.com/~u89404340/touchpad/index.html>
 	  and a new version of GPM at:
 		<http://www.geocities.com/dt_or/gpm/gpm.html>
+		<http://xorg.freedesktop.org/archive/individual/driver/>
 	  to take advantage of the advanced features of the touchpad.
 
 	  If unsure, say Y.
diff --git a/drivers/input/mouse/touchkit_ps2.c b/drivers/input/mouse/touchkit_ps2.c
index 88121c59c3cc..1fd8f5e192f9 100644
--- a/drivers/input/mouse/touchkit_ps2.c
+++ b/drivers/input/mouse/touchkit_ps2.c
@@ -21,8 +21,8 @@
  *
  * Based upon touchkitusb.c
  *
- * Vendor documentation is available in support section of:
- * http://www.egalax.com.tw/
+ * Vendor documentation is available at:
+ * http://home.eeti.com.tw/web20/drivers/Software%20Programming%20Guide_v2.0.pdf 
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/touchscreen/mk712.c b/drivers/input/touchscreen/mk712.c
index efd3aebaba5f..36e57deacd03 100644
--- a/drivers/input/touchscreen/mk712.c
+++ b/drivers/input/touchscreen/mk712.c
@@ -17,7 +17,7 @@
  * found in Gateway AOL Connected Touchpad computers.
  *
  * Documentation for ICS MK712 can be found at:
- *	http://www.icst.com/pdf/mk712.pdf
+ *	http://www.idt.com/products/getDoc.cfm?docID=18713923
  */
 
 /*
diff --git a/drivers/isdn/i4l/isdn_audio.c b/drivers/isdn/i4l/isdn_audio.c
index 861bdf3421f2..d5013935ac62 100644
--- a/drivers/isdn/i4l/isdn_audio.c
+++ b/drivers/isdn/i4l/isdn_audio.c
@@ -439,7 +439,7 @@ isdn_audio_xlaw2adpcm(adpcm_state * s, int fmt, unsigned char *in,
 
 /*
  * Goertzel algorithm.
- * See http://ptolemy.eecs.berkeley.edu/~pino/Ptolemy/papers/96/dtmf_ict/
+ * See http://ptolemy.eecs.berkeley.edu/papers/96/dtmf_ict/ 
  * for more info.
  * Result is stored into an sk_buff and queued up for later
  * evaluation.
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index c42eeb43042d..f153fc20ad6e 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -3,9 +3,9 @@
  *
  * Copyright (C) 2003, 2004 Colin Leroy, Rasmus Rohde, Benjamin Herrenschmidt
  *
- * Documentation from
- * http://www.analog.com/UploadedFiles/Data_Sheets/115254175ADT7467_pra.pdf
- * http://www.analog.com/UploadedFiles/Data_Sheets/3686221171167ADT7460_b.pdf
+ * Documentation from 115254175ADT7467_pra.pdf and 3686221171167ADT7460_b.pdf
+ * http://www.onsemi.com/PowerSolutions/product.do?id=ADT7467
+ * http://www.onsemi.com/PowerSolutions/product.do?id=ADT7460
  *
  */
 
diff --git a/drivers/media/IR/keymaps/rc-manli.c b/drivers/media/IR/keymaps/rc-manli.c
index 1e9fbfa90a1e..0f590b3d01c0 100644
--- a/drivers/media/IR/keymaps/rc-manli.c
+++ b/drivers/media/IR/keymaps/rc-manli.c
@@ -13,7 +13,6 @@
 #include <media/rc-map.h>
 
 /* Michael Tokarev <mjt@tls.msk.ru>
-   http://www.corpit.ru/mjt/beholdTV/remote_control.jpg
    keytable is used by MANLI MTV00[0x0c] and BeholdTV 40[13] at
    least, and probably other cards too.
    The "ascii-art picture" below (in comments, first row
diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c
index a6be529eec5c..f96c4a675c65 100644
--- a/drivers/media/dvb/ttpci/av7110.c
+++ b/drivers/media/dvb/ttpci/av7110.c
@@ -26,7 +26,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 
@@ -2290,12 +2290,7 @@ static int frontend_init(struct av7110 *av7110)
 /* Budgetpatch note:
  * Original hardware design by Roberto Deza:
  * There is a DVB_Wiki at
- * http://212.227.36.83/linuxtv/wiki/index.php/Main_Page
- * where is described this 'DVB TT Budget Patch', on Card Modding:
- * http://212.227.36.83/linuxtv/wiki/index.php/DVB_TT_Budget_Patch
- * On the short description there is also a link to a external file,
- * with more details:
- * http://perso.wanadoo.es/jesussolano/Ttf_tsc1.zip
+ * http://www.linuxtv.org/
  *
  * New software triggering design by Emard that works on
  * original Roberto Deza's hardware:
diff --git a/drivers/media/dvb/ttpci/av7110_av.c b/drivers/media/dvb/ttpci/av7110_av.c
index 13efba942dac..878da6a19fbc 100644
--- a/drivers/media/dvb/ttpci/av7110_av.c
+++ b/drivers/media/dvb/ttpci/av7110_av.c
@@ -25,7 +25,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include <linux/types.h>
diff --git a/drivers/media/dvb/ttpci/av7110_ca.c b/drivers/media/dvb/ttpci/av7110_ca.c
index 4eba35a018e3..7564c2618947 100644
--- a/drivers/media/dvb/ttpci/av7110_ca.c
+++ b/drivers/media/dvb/ttpci/av7110_ca.c
@@ -25,7 +25,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/media/dvb/ttpci/av7110_hw.c b/drivers/media/dvb/ttpci/av7110_hw.c
index e162691b515d..f1cbfe526989 100644
--- a/drivers/media/dvb/ttpci/av7110_hw.c
+++ b/drivers/media/dvb/ttpci/av7110_hw.c
@@ -22,7 +22,7 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 /* for debugging ARM communication: */
diff --git a/drivers/media/dvb/ttpci/av7110_v4l.c b/drivers/media/dvb/ttpci/av7110_v4l.c
index 8986d967d2f4..ac20c5bbfa43 100644
--- a/drivers/media/dvb/ttpci/av7110_v4l.c
+++ b/drivers/media/dvb/ttpci/av7110_v4l.c
@@ -22,7 +22,7 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/media/dvb/ttpci/budget-av.c b/drivers/media/dvb/ttpci/budget-av.c
index 983672aa2450..97afc01f60d0 100644
--- a/drivers/media/dvb/ttpci/budget-av.c
+++ b/drivers/media/dvb/ttpci/budget-av.c
@@ -30,7 +30,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include "budget.h"
diff --git a/drivers/media/dvb/ttpci/budget-ci.c b/drivers/media/dvb/ttpci/budget-ci.c
index 13ac9e3ab121..a9c2c326df4b 100644
--- a/drivers/media/dvb/ttpci/budget-ci.c
+++ b/drivers/media/dvb/ttpci/budget-ci.c
@@ -26,7 +26,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include <linux/module.h>
diff --git a/drivers/media/dvb/ttpci/budget-core.c b/drivers/media/dvb/ttpci/budget-core.c
index ba18e56d5f11..054661315311 100644
--- a/drivers/media/dvb/ttpci/budget-core.c
+++ b/drivers/media/dvb/ttpci/budget-core.c
@@ -31,7 +31,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 
diff --git a/drivers/media/dvb/ttpci/budget-patch.c b/drivers/media/dvb/ttpci/budget-patch.c
index 9c92f9ddd223..579835590690 100644
--- a/drivers/media/dvb/ttpci/budget-patch.c
+++ b/drivers/media/dvb/ttpci/budget-patch.c
@@ -27,7 +27,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include "av7110.h"
diff --git a/drivers/media/dvb/ttpci/budget.c b/drivers/media/dvb/ttpci/budget.c
index 874a10a9d493..d238fb9371a7 100644
--- a/drivers/media/dvb/ttpci/budget.c
+++ b/drivers/media/dvb/ttpci/budget.c
@@ -31,7 +31,7 @@
  * Or, point your browser to http://www.gnu.org/copyleft/gpl.html
  *
  *
- * the project's page is at http://www.linuxtv.org/dvb/
+ * the project's page is at http://www.linuxtv.org/ 
  */
 
 #include "budget.h"
diff --git a/drivers/media/radio/radio-maxiradio.c b/drivers/media/radio/radio-maxiradio.c
index 4349213b403b..255d40df4b46 100644
--- a/drivers/media/radio/radio-maxiradio.c
+++ b/drivers/media/radio/radio-maxiradio.c
@@ -13,7 +13,7 @@
  * anybody does please mail me.
  *
  * For the pdf file see:
- * http://www.semiconductors.philips.com/pip/TEA5757H/V1
+ * http://www.nxp.com/acrobat_download2/expired_datasheets/TEA5757_5759_3.pdf 
  *
  *
  * CHANGES:
diff --git a/drivers/media/radio/radio-typhoon.c b/drivers/media/radio/radio-typhoon.c
index 03439282dfce..b1f630527dc1 100644
--- a/drivers/media/radio/radio-typhoon.c
+++ b/drivers/media/radio/radio-typhoon.c
@@ -1,9 +1,6 @@
 /* Typhoon Radio Card driver for radio support
  * (c) 1999 Dr. Henrik Seidel <Henrik.Seidel@gmx.de>
  *
- * Card manufacturer:
- * http://194.18.155.92/idc/prod2.idc?nr=50753&lang=e
- *
  * Notes on the hardware
  *
  * This card has two output sockets, one for speakers and one for line.
diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index f6e4d0475351..d000522cb0f4 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -978,7 +978,7 @@ config USB_STKWEBCAM
 	  Supported devices are typically found in some Asus laptops,
 	  with USB id 174f:a311 and 05e1:0501. Other Syntek cameras
 	  may be supported by the stk11xx driver, from which this is
-	  derived, see http://stk11xx.sourceforge.net
+	  derived, see <http://sourceforge.net/projects/syntekdriver/> 
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called stkwebcam.
diff --git a/drivers/media/video/cafe_ccic.c b/drivers/media/video/cafe_ccic.c
index be35e6965829..9536f1a40dd2 100644
--- a/drivers/media/video/cafe_ccic.c
+++ b/drivers/media/video/cafe_ccic.c
@@ -4,7 +4,7 @@
  * sensor.
  *
  * The data sheet for this device can be found at:
- *    http://www.marvell.com/products/pcconn/88ALP01.jsp
+ *    http://www.marvell.com/products/pc_connectivity/88alp01/ 
  *
  * Copyright 2006 One Laptop Per Child Association, Inc.
  * Copyright 2006-7 Jonathan Corbet <corbet@lwn.net>
diff --git a/drivers/media/video/cx18/cx18-cards.c b/drivers/media/video/cx18/cx18-cards.c
index 6b805afe5d20..fe1090940b01 100644
--- a/drivers/media/video/cx18/cx18-cards.c
+++ b/drivers/media/video/cx18/cx18-cards.c
@@ -39,7 +39,7 @@ static struct cx18_card_tuner_i2c cx18_i2c_std = {
 	.tv    = { 0x61, 0x60, I2C_CLIENT_END },
 };
 
-/* Please add new PCI IDs to: http://pci-ids.ucw.cz/iii
+/* Please add new PCI IDs to: http://pci-ids.ucw.cz/ 
    This keeps the PCI ID database up to date. Note that the entries
    must be added under vendor 0x4444 (Conexant) as subsystem IDs.
    New vendor IDs should still be added to the vendor ID list. */
diff --git a/drivers/media/video/cx23885/cx23885-417.c b/drivers/media/video/cx23885/cx23885-417.c
index abd64e89f60f..53a67824071b 100644
--- a/drivers/media/video/cx23885/cx23885-417.c
+++ b/drivers/media/video/cx23885/cx23885-417.c
@@ -7,7 +7,7 @@
  *    (c) 2008 Steven Toth <stoth@linuxtv.org>
  *      - CX23885/7/8 support
  *
- *  Includes parts from the ivtv driver( http://ivtv.sourceforge.net/),
+ *  Includes parts from the ivtv driver <http://sourceforge.net/projects/ivtv/>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c
index e46e1ceef72c..660b2a927feb 100644
--- a/drivers/media/video/cx88/cx88-blackbird.c
+++ b/drivers/media/video/cx88/cx88-blackbird.c
@@ -9,7 +9,7 @@
  *    (c) 2005-2006 Mauro Carvalho Chehab <mchehab@infradead.org>
  *        - video_ioctl2 conversion
  *
- *  Includes parts from the ivtv driver( http://ivtv.sourceforge.net/),
+ *  Includes parts from the ivtv driver <http://sourceforge.net/projects/ivtv/>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
diff --git a/drivers/media/video/ivtv/ivtv-cards.c b/drivers/media/video/ivtv/ivtv-cards.c
index ca1fd3227a93..87afbbee2063 100644
--- a/drivers/media/video/ivtv/ivtv-cards.c
+++ b/drivers/media/video/ivtv/ivtv-cards.c
@@ -65,7 +65,7 @@ static struct ivtv_card_tuner_i2c ivtv_i2c_tda8290 = {
 
 /********************** card configuration *******************************/
 
-/* Please add new PCI IDs to: http://pci-ids.ucw.cz/iii
+/* Please add new PCI IDs to: http://pci-ids.ucw.cz/ 
    This keeps the PCI ID database up to date. Note that the entries
    must be added under vendor 0x4444 (Conexant) as subsystem IDs.
    New vendor IDs should still be added to the vendor ID list. */
diff --git a/drivers/media/video/mxb.c b/drivers/media/video/mxb.c
index ef0c8178f255..b1dbcf1d2bcb 100644
--- a/drivers/media/video/mxb.c
+++ b/drivers/media/video/mxb.c
@@ -3,7 +3,7 @@
 
     Copyright (C) 1998-2006 Michael Hunold <michael@mihu.de>
 
-    Visit http://www.mihu.de/linux/saa7146/mxb/
+    Visit http://www.themm.net/~mihu/linux/saa7146/mxb.html 
     for further details about this card.
 
     This program is free software; you can redistribute it and/or modify
diff --git a/drivers/media/video/sn9c102/sn9c102_pas202bcb.c b/drivers/media/video/sn9c102/sn9c102_pas202bcb.c
index 2782f94cf6f8..2e86fdc86989 100644
--- a/drivers/media/video/sn9c102/sn9c102_pas202bcb.c
+++ b/drivers/media/video/sn9c102/sn9c102_pas202bcb.c
@@ -4,7 +4,6 @@
  *                                                                         *
  * Copyright (C) 2004 by Carlos Eduardo Medaglia Dyonisio                  *
  *                       <medaglia@undl.org.br>                            *
- *                       http://cadu.homelinux.com:8080/                   *
  *                                                                         *
  * Support for SN9C103, DAC Magnitude, exposure and green gain controls    *
  * added by Luca Risolia <luca.risolia@studio.unibo.it>                    *
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 9df5b759a00b..0c31927c1562 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -112,8 +112,8 @@ config IBM_ASM
 
 	  WARNING: This software may not be supported or function
 	  correctly on your IBM server. Please consult the IBM ServerProven
-	  website <http://www.pc.ibm.com/ww/eserver/xseries/serverproven> for
-	  information on the specific driver level and support statement
+	  website <http://www-03.ibm.com/systems/info/x86servers/serverproven/compat/us/>
+	  for information on the specific driver level and support statement
 	  for your IBM server.
 
 config PHANTOM
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 3e6c47bdce53..ba29d2f0ffd7 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -418,8 +418,8 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary)
 
 			/*
 			 * Valid primary extension versions are: 1.0, 1.1, 1.2, 1.3, 1.4
-			 * see: http://www.amd.com/us-en/assets/content_type/DownloadableAssets/cfi_r20.pdf, page 19
-			 *      http://www.amd.com/us-en/assets/content_type/DownloadableAssets/cfi_100_20011201.pdf
+			 * see: http://cs.ozerki.net/zap/pub/axim-x5/docs/cfi_r20.pdf, page 19 
+			 *      http://www.spansion.com/Support/AppNotes/cfi_100_20011201.pdf
 			 *      http://www.spansion.com/Support/Datasheets/s29ws-p_00_a12_e.pdf
 			 */
 			if (extp->MajorVersion != '1' ||
diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c
index f4359fe7150f..caf604167f03 100644
--- a/drivers/mtd/devices/lart.c
+++ b/drivers/mtd/devices/lart.c
@@ -17,7 +17,7 @@
  *           - January 2000
  *
  *    [2] MTD internal API documentation
- *           - http://www.linux-mtd.infradead.org/tech/
+ *           - http://www.linux-mtd.infradead.org/ 
  *
  * Limitations:
  *
diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c
index 4d6a64c387ec..037b399df3f1 100644
--- a/drivers/mtd/ftl.c
+++ b/drivers/mtd/ftl.c
@@ -51,7 +51,7 @@
 
     Use of the FTL format for non-PCMCIA applications may be an
     infringement of these patents.  For additional information,
-    contact M-Systems (http://www.m-sys.com) directly.
+    contact M-Systems directly. M-Systems since acquired by Sandisk. 
 
 ======================================================================*/
 #include <linux/mtd/blktrans.h>
diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 701d942c6795..962212628f6e 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -172,7 +172,7 @@ config MTD_OCTAGON
 	  This provides a 'mapping' driver which supports the way in which
 	  the flash chips are connected in the Octagon-5066 Single Board
 	  Computer. More information on the board is available at
-	  <http://www.octagonsystems.com/CPUpages/5066.html>.
+	  <http://www.octagonsystems.com/products/5066.aspx>.
 
 config MTD_VMAX
 	tristate "JEDEC Flash device mapped on Tempustech VMAX SBC301"
@@ -284,7 +284,7 @@ config MTD_TQM8XXL
 	  chips, currently uses AMD one. This 'mapping' driver supports
 	  that arrangement, allowing the CFI probe and command set driver
 	  code to communicate with the chips on the TQM8xxL board. More at
-	  <http://www.denx.de/embedded-ppc-en.html>.
+	  <http://www.denx.de/wiki/PPCEmbedded/>.
 
 config MTD_RPXLITE
 	tristate "CFI Flash device mapped on RPX Lite or CLLF"
diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c
index db1dfc5a1b11..e06c8983978e 100644
--- a/drivers/mtd/nand/cafe_nand.c
+++ b/drivers/mtd/nand/cafe_nand.c
@@ -2,7 +2,7 @@
  * Driver for One Laptop Per Child ‘CAFÉ’ controller, aka Marvell 88ALP01
  *
  * The data sheet for this device can be found at:
- *    http://www.marvell.com/products/pcconn/88ALP01.jsp
+ *    http://wiki.laptop.org/go/Datasheets 
  *
  * Copyright © 2006 Red Hat, Inc.
  * Copyright © 2006 David Woodhouse <dwmw2@infradead.org>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5a6895320b48..2a34e214a7f9 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -921,7 +921,7 @@ config SMC91X
 	  including the SMC91C94 and the SMC91C111. Say Y if you want it
 	  compiled into the kernel, and read the file
 	  <file:Documentation/networking/smc9.txt>  and the Ethernet-HOWTO,
-	  available from  <http://www.linuxdoc.org/docs.html#howto>.
+	  available from  <http://www.tldp.org/docs.html#howto>.
 
 	  This driver is also available as a module ( = code which can be
 	  inserted in and removed from the running kernel whenever you want).
@@ -1021,7 +1021,7 @@ config SMC911X
 	  including the new LAN9115, LAN9116, LAN9117, and LAN9118.
 	  Say Y if you want it compiled into the kernel, 
 	  and read the Ethernet-HOWTO, available from
-	  <http://www.linuxdoc.org/docs.html#howto>.
+	  <http://www.tldp.org/docs.html#howto>.
 
 	  This driver is also available as a module. The module will be 
 	  called smc911x.  If you want to compile it as a module, say M 
@@ -1503,7 +1503,7 @@ config E100
 
 	  For the latest Intel PRO/100 network driver for Linux, see:
 
-	  <http://appsr.intel.com/scripts-df/support_intel.asp>
+	  <http://www.intel.com/p/en_US/support/highlights/network/pro100plus>
 
 	  More specific information on configuring the driver is in 
 	  <file:Documentation/networking/e100.txt>.
@@ -1529,9 +1529,8 @@ config FEALNX
 	select CRC32
 	select MII
 	help
-	  Say Y here to support the Mysom MTD-800 family of PCI-based Ethernet
-	  cards. Specifications and data at
-	  <http://www.myson.com.hk/mtd/datasheet/>.
+	  Say Y here to support the Myson MTD-800 family of PCI-based Ethernet 
+	  cards. <http://www.myson.com.tw/>
 
 config NATSEMI
 	tristate "National Semiconductor DP8381x series PCI Ethernet support"
@@ -1705,7 +1704,7 @@ config SMSC9420
 	  This is a driver for SMSC's LAN9420 PCI ethernet adapter.
 	  Say Y if you want it compiled into the kernel,
 	  and read the Ethernet-HOWTO, available from
-	  <http://www.linuxdoc.org/docs.html#howto>.
+	  <http://www.tldp.org/docs.html#howto>.
 
 	  This driver is also available as a module. The module will be
 	  called smsc9420.  If you want to compile it as a module, say M
@@ -2540,7 +2539,7 @@ config CHELSIO_T1
           our website at <http://www.chelsio.com>.
 
           For customer support, please visit our customer support page at
-          <http://www.chelsio.com/support.htm>.
+          <http://www.chelsio.com/support.html>.
 
           Please send feedback to <linux-bugs@chelsio.com>.
 
@@ -2572,7 +2571,7 @@ config CHELSIO_T3
 	  our website at <http://www.chelsio.com>.
 
 	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.htm>.
+	  <http://www.chelsio.com/support.html>.
 
 	  Please send feedback to <linux-bugs@chelsio.com>.
 
@@ -2597,7 +2596,7 @@ config CHELSIO_T4
 	  our website at <http://www.chelsio.com>.
 
 	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.htm>.
+	  <http://www.chelsio.com/support.html>.
 
 	  Please send feedback to <linux-bugs@chelsio.com>.
 
@@ -2620,7 +2619,7 @@ config CHELSIO_T4VF
 	  our website at <http://www.chelsio.com>.
 
 	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.htm>.
+	  <http://www.chelsio.com/support.html>.
 
 	  Please send feedback to <linux-bugs@chelsio.com>.
 
diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig
index 0a0e0cd81a23..f5a89164e779 100644
--- a/drivers/net/appletalk/Kconfig
+++ b/drivers/net/appletalk/Kconfig
@@ -18,7 +18,7 @@ config ATALK
 
 	  General information about how to connect Linux, Windows machines and
 	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.  The
-	  NET-3-HOWTO, available from
+	  NET3-4-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>, contains valuable
 	  information as well.
 
diff --git a/drivers/net/atp.c b/drivers/net/atp.c
index bd2f9d331dac..0eee04426697 100644
--- a/drivers/net/atp.c
+++ b/drivers/net/atp.c
@@ -68,7 +68,7 @@ static int xcvr[NUM_UNITS]; 			/* The data transfer mode. */
 
 	In 1997 Realtek made available the documentation for the second generation
 	RTL8012 chip, which has lead to several driver improvements.
-	  http://www.realtek.com.tw/cn/cn.html
+	  http://www.realtek.com.tw/
 
 					Theory of Operation
 
diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c
index 57c8ac0ef3f1..c6e25716b2c2 100644
--- a/drivers/net/epic100.c
+++ b/drivers/net/epic100.c
@@ -131,8 +131,8 @@ IIIa. Ring buffers
 
 IVb. References
 
-http://www.smsc.com/main/tools/discontinued/83c171.pdf
-http://www.smsc.com/main/tools/discontinued/83c175.pdf
+http://www.smsc.com/media/Downloads_Public/discontinued/83c171.pdf
+http://www.smsc.com/media/Downloads_Public/discontinued/83c175.pdf
 http://scyld.com/expert/NWay.html
 http://www.national.com/pf/DP/DP83840A.html
 
diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig
index 62d5d5cfd6a6..95dbcfdf131d 100644
--- a/drivers/net/hamradio/Kconfig
+++ b/drivers/net/hamradio/Kconfig
@@ -73,7 +73,7 @@ config DMASCC
 	  certain parameters, such as channel access timing, clock mode, and
 	  DMA channel. This is accomplished with a small utility program,
 	  dmascc_cfg, available at
-	  <http://cacofonix.nt.tuwien.ac.at/~oe1kib/Linux/>. Please be sure to
+	  <http://www.linux-ax25.org/wiki/Ax25-tools>. Please be sure to
 	  get at least version 1.27 of dmascc_cfg, as older versions will not
 	  work with the current driver.
 
diff --git a/drivers/net/ibmlana.c b/drivers/net/ibmlana.c
index 294ccfb427cf..8de327321c41 100644
--- a/drivers/net/ibmlana.c
+++ b/drivers/net/ibmlana.c
@@ -23,7 +23,7 @@ paper sources:
   'LAN Technical Reference Ethernet Adapter Interface Version 1 Release 1.0
    Document Number SC30-3661-00' by IBM for info on the adapter itself
 
-  Also see http://www.natsemi.com/
+  Also see http://www.national.com/analog 
 
 special acknowledgements to:
   - Bob Eager for helping me out with documentation from IBM
diff --git a/drivers/net/irda/donauboe.h b/drivers/net/irda/donauboe.h
index 36c3060411d2..4dc39e5f0156 100644
--- a/drivers/net/irda/donauboe.h
+++ b/drivers/net/irda/donauboe.h
@@ -54,7 +54,7 @@
 /* anyone who has. HOWEVER the chip bears a striking resemblence */
 /* to the IrDA controller in the Toshiba RISC TMPR3922 chip      */
 /* the documentation for this is freely available at             */
-/* http://www.toshiba.com/taec/components/Generic/TMPR3922.shtml */
+/* http://www.madingley.org/james/resources/toshoboe/TMPR3922.pdf */
 /* The mapping between the registers in that document and the    */
 /* Registers in the 701 oboe chip are as follows    */
 
diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c
index 56f3fc45dbaa..627b619742e3 100644
--- a/drivers/net/pci-skeleton.c
+++ b/drivers/net/pci-skeleton.c
@@ -78,7 +78,7 @@ that almost all frames will need to be copied to an alignment buffer.
 
 IVb. References
 
-http://www.realtek.com.tw/cn/cn.html
+http://www.realtek.com.tw/
 http://www.scyld.com/expert/NWay.html
 
 IVc. Errata
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index c683f77c6f42..e09267965aad 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -62,7 +62,7 @@ invalid ramWidth is Very Bad.
 V. References
 
 http://www.scyld.com/expert/NWay.html
-http://www.national.com/pf/DP/DP83840.html
+http://www.national.com/opf/DP/DP83840A.html
 
 Thanks to Terry Murphy of 3Com for providing development information for
 earlier 3Com products.
diff --git a/drivers/net/sc92031.c b/drivers/net/sc92031.c
index 8c4067af32b0..12fb5607176c 100644
--- a/drivers/net/sc92031.c
+++ b/drivers/net/sc92031.c
@@ -15,7 +15,7 @@
  *  Rewritten for 2.6 by Cesar Eduardo Barros
  *
  *  A datasheet for this chip can be found at
- *  http://www.silan.com.cn/english/products/pdf/SC92031AY.pdf
+ *  http://www.silan.com.cn/english/product/pdf/SC92031AY.pdf 
  */
 
 /* Note about set_mac_address: I don't know how to change the hardware
diff --git a/drivers/net/tlan.c b/drivers/net/tlan.c
index ccee3eddc5f4..ef000b20d8af 100644
--- a/drivers/net/tlan.c
+++ b/drivers/net/tlan.c
@@ -78,7 +78,7 @@
  * 			     - Updated tlan.txt accordingly.
  * 			     - Adjusted minimum/maximum frame length.
  * 			     - There is now a TLAN website up at
- * 			       http://tlan.kernel.dk
+ * 			       http://hp.sourceforge.net/ 
  *
  * 	v1.7 April 07, 2000  - Started to implement custom ioctls. Driver now
  * 			       reports PHY information when used with Donald
diff --git a/drivers/net/tokenring/tms380tr.c b/drivers/net/tokenring/tms380tr.c
index 435ef7d5470f..ccc16d6f5266 100644
--- a/drivers/net/tokenring/tms380tr.c
+++ b/drivers/net/tokenring/tms380tr.c
@@ -5,7 +5,7 @@
  *  Originally sktr.c: Written 1997 by Christoph Goos
  *
  *  A fine result of the Linux Systems Network Architecture Project.
- *  http://www.linux-sna.org
+ *  http://www.vanheusden.com/sna/ 
  *
  *  This software may be used and distributed according to the terms
  *  of the GNU General Public License, incorporated herein by reference.
diff --git a/drivers/net/tulip/Kconfig b/drivers/net/tulip/Kconfig
index 516713fa0a05..14c02e12139f 100644
--- a/drivers/net/tulip/Kconfig
+++ b/drivers/net/tulip/Kconfig
@@ -151,7 +151,7 @@ config ULI526X
 	select CRC32
 	---help---
 	  This driver is for ULi M5261/M5263 10/100M Ethernet Controller
-	  (<http://www.uli.com.tw/>).
+	  (<http://www.nvidia.com/page/uli_drivers.html>).
 
 	  To compile this driver as a module, choose M here. The module will
 	  be called uli526x.
diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c
index 08555f8b15f4..08ad269f6b4e 100644
--- a/drivers/net/usb/plusb.c
+++ b/drivers/net/usb/plusb.c
@@ -32,7 +32,7 @@
 
 
 /*
- * Prolific PL-2301/PL-2302 driver ... http://www.prolifictech.com
+ * Prolific PL-2301/PL-2302 driver ... http://www.prolific.com.tw/ 
  *
  * The protocol and handshaking used here should be bug-compatible
  * with the Linux 2.2 "plusb" driver, by Deti Fliegl.
diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig
index d08ce6a264cb..423eb26386c8 100644
--- a/drivers/net/wan/Kconfig
+++ b/drivers/net/wan/Kconfig
@@ -409,7 +409,7 @@ config CYCLADES_SYNC
 	tristate "Cyclom 2X(tm) cards (EXPERIMENTAL)"
 	depends on WAN_ROUTER_DRIVERS && (PCI || ISA)
 	---help---
-	  Cyclom 2X from Cyclades Corporation <http://www.cyclades.com/> is an
+	  Cyclom 2X from Cyclades Corporation <http://www.avocent.com/> is an
 	  intelligent multiprotocol WAN adapter with data transfer rates up to
 	  512 Kbps. These cards support the X.25 and SNA related protocols.
 
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index ea6362a8988d..97659a077af8 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -351,7 +351,7 @@ struct ath5k_srev_name {
 /*
  * Some of this information is based on Documentation from:
  *
- * http://madwifi.org/wiki/ChipsetFeatures/SuperAG
+ * http://madwifi-project.org/wiki/ChipsetFeatures/SuperAG 
  *
  * Modulation for Atheros' eXtended Range - range enhancing extension that is
  * supposed to double the distance an Atheros client device can keep a
diff --git a/drivers/net/wireless/ath/ath5k/reg.h b/drivers/net/wireless/ath/ath5k/reg.h
index 55b4ac6d236f..69a9b34ed456 100644
--- a/drivers/net/wireless/ath/ath5k/reg.h
+++ b/drivers/net/wireless/ath/ath5k/reg.h
@@ -26,7 +26,6 @@
  * Atheros presentations and papers like these:
  *
  * 5210 - http://nova.stanford.edu/~bbaas/ps/isscc2002_slides.pdf
- *        http://www.it.iitb.ac.in/~janak/wifire/01222734.pdf
  *
  * 5211 - http://www.hotchips.org/archives/hc14/3_Tue/16_mcfarland.pdf
  *
diff --git a/drivers/net/wireless/p54/Kconfig b/drivers/net/wireless/p54/Kconfig
index b0342a520bf1..8e823796cb94 100644
--- a/drivers/net/wireless/p54/Kconfig
+++ b/drivers/net/wireless/p54/Kconfig
@@ -8,7 +8,7 @@ config P54_COMMON
 	  also need to be enabled in order to support any devices.
 
 	  These devices require softmac firmware which can be found at
-	  http://prism54.org/
+	  <http://wireless.kernel.org/en/users/Drivers/p54>
 
 	  If you choose to build a module, it'll be called p54common.
 
@@ -20,7 +20,7 @@ config P54_USB
 	  This driver is for USB isl38xx based wireless cards.
 
 	  These devices require softmac firmware which can be found at
-	  http://prism54.org/
+	  <http://wireless.kernel.org/en/users/Drivers/p54>
 
 	  If you choose to build a module, it'll be called p54usb.
 
@@ -34,7 +34,7 @@ config P54_PCI
 	  supported by the fullmac driver/firmware.
 
 	  This driver requires softmac firmware which can be found at
-	  http://prism54.org/
+	  <http://wireless.kernel.org/en/users/Drivers/p54>
 
 	  If you choose to build a module, it'll be called p54pci.
 
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index dc14420a9adc..b5e64d71b7a6 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -38,7 +38,7 @@ module_param(init_pcitm, int, 0);
 /* In this order: vendor, device, subvendor, subdevice, class, class_mask,
  * driver_data
  * If you have an update for this please contact prism54-devel@prism54.org
- * The latest list can be found at http://prism54.org/supported_cards.php */
+ * The latest list can be found at http://wireless.kernel.org/en/users/Drivers/p54 */
 static DEFINE_PCI_DEVICE_TABLE(prism54_id_tbl) = {
 	/* Intersil PRISM Duette/Prism GT Wireless LAN adapter */
 	{
diff --git a/drivers/parisc/README.dino b/drivers/parisc/README.dino
index 097324f34bbe..1627426996c1 100644
--- a/drivers/parisc/README.dino
+++ b/drivers/parisc/README.dino
@@ -10,8 +10,7 @@
 ** PCI bus. HP-supplied graphics cards that utilize the PCI bus are
 ** not affected."
 **
-** REVISIT: "go/pci_defect" link below is stale.
-**	HP Internal can use <http://hpfcdma.fc.hp.com:80/Dino/>
+** http://h20000.www2.hp.com/bizsupport/TechSupport/Home.jsp?locale=en_US&prodTypeId=12454&prodSeriesId=44443
 **
 **	Product		First Good Serial Number
 **  C200/C240 (US)	US67350000
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 89ed181cd90c..dcc333f2cb6a 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -206,6 +206,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 	PCI_DEVICE_ID_INTEL_82439TX, 	quir
  *	VIA Apollo KT133 needs PCI latency patch
  *	Made according to a windows driver based patch by George E. Breese
  *	see PCI Latency Adjust on http://www.viahardware.com/download/viatweak.shtm
+ *	and http://www.georgebreese.com/net/software/#PCI
  *      Also see http://www.au-ja.org/review-kt133a-1-en.phtml for
  *      the info on which Mr Breese based his work.
  *
@@ -996,7 +997,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TOSHIBA,	0x605,	quirk_transparent_bridge)
 /*
  * Common misconfiguration of the MediaGX/Geode PCI master that will
  * reduce PCI bandwidth from 70MB/s to 25MB/s.  See the GXM/GXLV/GX1
- * datasheets found at http://www.national.com/ds/GX for info on what
+ * datasheets found at http://www.national.com/analog for info on what
  * these bits do.  <christer@weinigel.se>
  */
 static void quirk_mediagx_master(struct pci_dev *dev)
diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c
index 414d9a6f9a32..91a722518289 100644
--- a/drivers/pcmcia/yenta_socket.c
+++ b/drivers/pcmcia/yenta_socket.c
@@ -1073,7 +1073,7 @@ static void yenta_config_init(struct yenta_socket *socket)
  * invisible during PCI scans because of a misconfigured subordinate number
  * of the parent brige - some BIOSes seem to be too lazy to set it right.
  * Does the fixup carefully by checking how far it can go without conflicts.
- * See http\://bugzilla.kernel.org/show_bug.cgi?id=2944 for more information.
+ * See http://bugzilla.kernel.org/show_bug.cgi?id=2944 for more information.
  */
 static void yenta_fixup_parent_bridge(struct pci_bus *cardbus_bridge)
 {
diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c
index 2d8ac43f78e8..bc89f392a629 100644
--- a/drivers/pnp/pnpbios/proc.c
+++ b/drivers/pnp/pnpbios/proc.c
@@ -11,7 +11,6 @@
  *
  * The .../escd file is utilized by the lsescd utility written by
  * Gunther Mayer.
- *     http://home.t-online.de/home/gunther.mayer/lsescd
  *
  * The .../legacy_device_resources file is not used yet.
  *
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 158284f05732..85da296a49be 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -116,7 +116,7 @@ config CHR_DEV_OSST
 	  <http://www.tldp.org/docs.html#howto>  and
 	  <file:Documentation/scsi/osst.txt>  in the kernel source.
 	  More info on the OnStream driver may be found on
-	  <http://linux1.onstream.nl/test/>
+	  <http://sourceforge.net/projects/osst/>
 	  Please also have a look at the standard st docu, as most of it
 	  applies to osst as well.
 
@@ -156,9 +156,9 @@ config CHR_DEV_SG
 	  directly, so you need some additional software which knows how to
 	  talk to these devices using the SCSI protocol:
 
-	  For scanners, look at SANE (<http://www.mostang.com/sane/>). For CD
+	  For scanners, look at SANE (<http://www.sane-project.org/>). For CD
 	  writer software look at Cdrtools
-	  (<http://www.fokus.gmd.de/research/cc/glone/employees/joerg.schilling/private/cdrecord.html>)
+	  (<http://cdrecord.berlios.de/private/cdrecord.html>)
 	  and for burning a "disk at once": CDRDAO
 	  (<http://cdrdao.sourceforge.net/>). Cdparanoia is a high
 	  quality digital reader of audio CDs (<http://www.xiph.org/paranoia/>).
@@ -942,6 +942,7 @@ config SCSI_IPS
 	---help---
 	  This is support for the IBM ServeRAID hardware RAID controllers.
 	  See <http://www.developer.ibm.com/welcome/netfinity/serveraid.html>
+	  and <http://www-947.ibm.com/support/entry/portal/docdisplay?brand=5000008&lndocid=SERV-RAID>
 	  for more information.  If this driver does not work correctly
 	  without modification please contact the author by email at
 	  <ipslinux@adaptec.com>.
@@ -1601,7 +1602,7 @@ config SCSI_DEBUG
 	  host adapter with one dummy SCSI disk. Each dummy disk uses kernel
 	  RAM as storage (i.e. it is a ramdisk). To save space when multiple
 	  dummy disks are simulated, they share the same kernel RAM for 
-	  their storage. See <http://www.torque.net/sg/sdebug.html> for more
+	  their storage. See <http://sg.danny.cz/sg/sdebug26.html> for more
 	  information. This driver is primarily of use to those testing the
 	  SCSI and block subsystems. If unsure, say N.
 
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 24110f6f61e0..131c95f5476a 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -924,7 +924,7 @@ static int broken_efr(struct uart_8250_port *up)
 	/*
 	 * Exar ST16C2550 "A2" devices incorrectly detect as
 	 * having an EFR, and report an ID of 0x0201.  See
-	 * http://www.exar.com/info.php?pdf=dan180_oct2004.pdf
+	 * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html 
 	 */
 	if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16)
 		return 1;
diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index e57fb3d228e2..81987a7b9aa3 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -10,7 +10,7 @@
 
 /*
  * This driver and the hardware supported are in term of EE-191 of ADI.
- * http://www.analog.com/UploadedFiles/Application_Notes/399447663EE191.pdf
+ * http://www.analog.com/static/imported-files/application_notes/EE191.pdf 
  * This application note describe how to implement a UART on a Sharc DSP,
  * but this driver is implemented on Blackfin Processor.
  * Transmit Frame Sync is not used by this driver to transfer data out.
diff --git a/drivers/serial/bfin_sport_uart.h b/drivers/serial/bfin_sport_uart.h
index 9ce253e381d2..6d06ce1d5675 100644
--- a/drivers/serial/bfin_sport_uart.h
+++ b/drivers/serial/bfin_sport_uart.h
@@ -10,7 +10,7 @@
 
 /*
  * This driver and the hardware supported are in term of EE-191 of ADI.
- * http://www.analog.com/UploadedFiles/Application_Notes/399447663EE191.pdf
+ * http://www.analog.com/static/imported-files/application_notes/EE191.pdf 
  * This application note describe how to implement a UART on a Sharc DSP,
  * but this driver is implemented on Blackfin Processor.
  * Transmit Frame Sync is not used by this driver to transfer data out.
diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c
index caf085d3a76a..c761b2223129 100644
--- a/drivers/serial/uartlite.c
+++ b/drivers/serial/uartlite.c
@@ -44,7 +44,7 @@ MODULE_DEVICE_TABLE(of, ulite_of_match);
  * Register definitions
  *
  * For register details see datasheet:
- * http://www.xilinx.com/bvdocs/ipcenter/data_sheet/opb_uartlite.pdf
+ * http://www.xilinx.com/support/documentation/ip_documentation/opb_uartlite.pdf 
  */
 
 #define ULITE_RX		0x00
diff --git a/drivers/staging/asus_oled/README b/drivers/staging/asus_oled/README
index 96b9717f168f..0d82a6d5fa58 100644
--- a/drivers/staging/asus_oled/README
+++ b/drivers/staging/asus_oled/README
@@ -2,7 +2,7 @@
     Driver for Asus OLED display present in some Asus laptops.
 
     The code of this driver is based on 'asusoled' program taken from
-    https://launchpad.net/asusoled/. I just wanted to have a simple
+    <http://lapsus.berlios.de/asus_oled.html>. I just wanted to have a simple
     kernel driver for controlling this device, but I didn't know how
     to do that. Now I know ;) Also, that program can not be used
     with usbhid loaded, which means no USB mouse/keyboard while
diff --git a/drivers/staging/asus_oled/asus_oled.c b/drivers/staging/asus_oled/asus_oled.c
index 5b279fb30f3f..8c95d8c2a4f4 100644
--- a/drivers/staging/asus_oled/asus_oled.c
+++ b/drivers/staging/asus_oled/asus_oled.c
@@ -24,7 +24,7 @@
  *
  *
  *  Asus OLED support is based on asusoled program taken from
- *  https://launchpad.net/asusoled/.
+ *  <http://lapsus.berlios.de/asus_oled.html>.
  *
  *
  */
diff --git a/drivers/staging/comedi/drivers/cb_pcimdas.c b/drivers/staging/comedi/drivers/cb_pcimdas.c
index ced346a7cae3..78b1410ba4f6 100644
--- a/drivers/staging/comedi/drivers/cb_pcimdas.c
+++ b/drivers/staging/comedi/drivers/cb_pcimdas.c
@@ -37,7 +37,7 @@ Configuration Options:
 Developed from cb_pcidas and skel by Richard Bytheway (mocelet@sucs.org).
 Only supports DIO, AO and simple AI in it's present form.
 No interrupts, multi channel or FIFO AI, although the card looks like it could support this.
-See http://www.measurementcomputing.com/PDFManuals/pcim-das1602_16.pdf for more details.
+See http://www.mccdaq.com/PDFs/Manuals/pcim-das1602-16.pdf for more details.
 */
 
 #include "../comedidev.h"
diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c
index 6af6c8323d56..82be77daa7d7 100644
--- a/drivers/staging/comedi/drivers/daqboard2000.c
+++ b/drivers/staging/comedi/drivers/daqboard2000.c
@@ -50,8 +50,8 @@ Configuration options:
    With some help from our swedish distributor, we got the Windows sourcecode
    for the card, and here are the findings so far.
 
-   1. A good document that describes the PCI interface chip is found at:
-      http://plx.plxtech.com/download/9080/databook/9080db-106.pdf
+   1. A good document that describes the PCI interface chip is 9080db-106.pdf
+      available from http://www.plxtech.com/products/io/pci9080 
 
    2. The initialization done so far is:
         a. program the FPGA (windows code sans a lot of error messages)
diff --git a/drivers/staging/comedi/drivers/ni_labpc.c b/drivers/staging/comedi/drivers/ni_labpc.c
index 3acf7e62bec4..1411dd8f4e7c 100644
--- a/drivers/staging/comedi/drivers/ni_labpc.c
+++ b/drivers/staging/comedi/drivers/ni_labpc.c
@@ -37,7 +37,7 @@ boards has not
 yet been added to the driver, mainly due to the fact that
 I don't know the device id numbers.  If you have one
 of these boards,
-please file a bug report at https://bugs.comedi.org/
+please file a bug report at http://comedi.org/ 
 so I can get the necessary information from you.
 
 The 1200 series boards have onboard calibration dacs for correcting
diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c
index bd16f913af23..986ef6712989 100644
--- a/drivers/staging/comedi/drivers/ni_mio_common.c
+++ b/drivers/staging/comedi/drivers/ni_mio_common.c
@@ -34,7 +34,7 @@
 	   340747b.pdf  AT-MIO E series Register Level Programmer Manual
 	   341079b.pdf  PCI E Series RLPM
 	   340934b.pdf  DAQ-STC reference manual
-	67xx and 611x registers (from http://www.ni.com/pdf/daq/us)
+	67xx and 611x registers (from ftp://ftp.ni.com/support/daq/mhddk/documentation/)
 	release_ni611x.pdf
 	release_ni67xx.pdf
 	Other possibly relevant info:
diff --git a/drivers/staging/comedi/drivers/plx9080.h b/drivers/staging/comedi/drivers/plx9080.h
index 485d63f99293..0d254a1b78a7 100644
--- a/drivers/staging/comedi/drivers/plx9080.h
+++ b/drivers/staging/comedi/drivers/plx9080.h
@@ -13,7 +13,7 @@
  *
  ********************************************************************
  *
- * Copyright (C) 1999 RG Studio s.c., http://www.rgstudio.com.pl/
+ * Copyright (C) 1999 RG Studio s.c.
  * Written by Krzysztof Halasa <khc@rgstudio.com.pl>
  *
  * Portions (C) SBE Inc., used by permission.
diff --git a/drivers/staging/comedi/drivers/rtd520.c b/drivers/staging/comedi/drivers/rtd520.c
index 0367d2b9e2fa..a49a7c566d37 100644
--- a/drivers/staging/comedi/drivers/rtd520.c
+++ b/drivers/staging/comedi/drivers/rtd520.c
@@ -59,7 +59,7 @@ Configuration options:
     Data sheet: http://www.rtdusa.com/pdf/dm7520.pdf
     Example source: http://www.rtdusa.com/examples/dm/dm7520.zip
     Call them and ask for the register level manual.
-    PCI chip: http://www.plxtech.com/products/toolbox/9080.htm
+    PCI chip: http://www.plxtech.com/products/io/pci9080 
 
     Notes:
     This board is memory mapped.  There is some IO stuff, but it isn't needed.
diff --git a/drivers/staging/quickstart/quickstart.c b/drivers/staging/quickstart/quickstart.c
index 66122479d529..ba8f670ec0a7 100644
--- a/drivers/staging/quickstart/quickstart.c
+++ b/drivers/staging/quickstart/quickstart.c
@@ -5,8 +5,7 @@
  *  Copyright (C) 2007-2010 Angelo Arrifano <miknix@gmail.com>
  *
  *  Information gathered from disassebled dsdt and from here:
- *  "http://download.microsoft.com/download/9/c/5/
- *  9c5b2167-8017-4bae-9fde-d599bac8184a/DirAppLaunch_Vista.doc"
+ *  <http://www.microsoft.com/whdc/system/platform/firmware/DirAppLaunch.mspx> 
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 1da73ecd9799..bb440792a1b7 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -17,9 +17,9 @@ config UIO_CIF
 	depends on PCI
 	help
 	  Driver for Hilscher CIF DeviceNet and Profibus cards.  This
-	  driver requires a userspace component that handles all of the
-	  heavy lifting and can be found at:
-	  	http://www.osadl.org/projects/downloads/UIO/user/cif-*
+  	  driver requires a userspace component called cif that handles
+	  all of the heavy lifting and can be found at:
+	        <http://www.osadl.org/projects/downloads/UIO/user/>
 
 	  To compile this driver as a module, choose M here: the module
 	  will be called uio_cif.
diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index 916b2b6d765f..b71e309116a3 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig
@@ -176,7 +176,7 @@ config USB_SERIAL_VISOR
 	help
 	  Say Y here if you want to connect to your HandSpring Visor, Palm
 	  m500 or m505 through its USB docking station. See
-	  <http://usbvisor.sourceforge.net/> for more information on using this
+	  <http://usbvisor.sourceforge.net/index.php3> for more information on using this
 	  driver.
 
 	  To compile this driver as a module, choose M here: the
@@ -289,7 +289,7 @@ config USB_SERIAL_KEYSPAN
 	  and was developed with their support.  You must also include
 	  firmware to support your particular device(s).
 
-	  See <http://misc.nu/hugh/keyspan.html> for more information.
+	  See <http://blemings.org/hugh/keyspan.html> for more information.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called keyspan.
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 6e612c52e763..732ff2a2fa36 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -46,7 +46,7 @@
 #define FTDI_USINT_RS232_PID	0xb812	/* Navigator RS232 and CONFIG lines */
 
 /* OOCDlink by Joern Kaipf <joernk@web.de>
- * (http://www.joernonline.de/dw/doku.php?id=start&idx=projects:oocdlink) */
+ * (http://www.joernonline.de/) */
 #define FTDI_OOCDLINK_PID	0xbaf8	/* Amontec JTAGkey */
 
 /* Luminary Micro Stellaris Boards, VID = FTDI_VID */
@@ -320,7 +320,7 @@
 #define FTDI_PIEGROUP_PID	0xF208	/* Product Id */
 
 /* ACT Solutions HomePro ZWave interface
-   (http://www.act-solutions.com/HomePro.htm) */
+   (http://www.act-solutions.com/HomePro-Product-Matrix.html) */
 #define FTDI_ACTZWAVE_PID	0xF2D0
 
 /*
@@ -351,7 +351,7 @@
 #define FTDI_SUUNTO_SPORTS_PID	0xF680	/* Suunto Sports instrument */
 
 /* USB-UIRT - An infrared receiver and transmitter using the 8U232AM chip */
-/* http://home.earthlink.net/~jrhees/USBUIRT/index.htm */
+/* http://www.usbuirt.com/ */
 #define FTDI_USB_UIRT_PID	0xF850	/* Product Id */
 
 /* CCS Inc. ICDU/ICDU40 product ID -
@@ -380,7 +380,7 @@
  */
 #define FTDI_HE_TIRA1_PID	0xFA78	/* Tira-1 IR transceiver */
 
-/* Inside Accesso contactless reader (http://www.insidefr.com) */
+/* Inside Accesso contactless reader (http://www.insidecontactless.com/) */
 #define INSIDE_ACCESSO		0xFAD0
 
 /*
@@ -619,14 +619,14 @@
 
 /*
  * JETI SPECTROMETER SPECBOS 1201
- * http://www.jeti.com/products/sys/scb/scb1201.php
+ * http://www.jeti.com/cms/index.php/instruments/other-instruments/specbos-2101
  */
 #define JETI_VID		0x0c6c
 #define JETI_SPC1201_PID	0x04b2
 
 /*
  * FTDI USB UART chips used in construction projects from the
- * Elektor Electronics magazine (http://elektor-electronics.co.uk)
+ * Elektor Electronics magazine (http://www.elektor.com/)
  */
 #define ELEKTOR_VID		0x0C7D
 #define ELEKTOR_FT323R_PID	0x0005	/* RFID-Reader, issue 09-2006 */
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index 297163c3c610..0791778a66f3 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -9,7 +9,7 @@
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
-  See http://misc.nu/hugh/keyspan.html for more information.
+  See http://blemings.org/hugh/keyspan.html for more information.
 
   Code in this driver inspired by and in a number of places taken
   from Brian Warner's original Keyspan-PDA driver.
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index bf3297ddd186..2d8baf6ac472 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -9,7 +9,7 @@
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
-  See http://misc.nu/hugh/keyspan.html for more information.
+  See http://blemings.org/hugh/keyspan.html for more information.
   
   Code in this driver inspired by and in a number of places taken
   from Brian Warner's original Keyspan-PDA driver.
diff --git a/drivers/usb/serial/mct_u232.h b/drivers/usb/serial/mct_u232.h
index 3a3f5e6b8f96..d325bb8cb583 100644
--- a/drivers/usb/serial/mct_u232.h
+++ b/drivers/usb/serial/mct_u232.h
@@ -10,10 +10,9 @@
  *
  * This driver is for the device MCT USB-RS232 Converter (25 pin, Model No.
  * U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
- * Model No. U232-P9). See http://www.mct.com.tw/p_u232.html for further
- * information. The properties of this device are listed at the end of this
- * file. This device is available from various distributors. I know Hana,
- * http://www.hana.de and D-Link, http://www.dlink.com/products/usb/dsbs25.
+ * Model No. U232-P9). See http://www.mct.com.tw/products/product_us232.html 
+ * for further information. The properties of this device are listed at the end 
+ * of this file. This device was used in the Dlink DSB-S25.
  *
  * All of the information about the device was acquired by using SniffUSB
  * on Windows98. The technical details of the reverse engineering are
@@ -458,7 +457,7 @@ static int mct_u232_calculate_baud_rate(struct usb_serial *serial,
  * embedded UART.  Exhaustive documentation for these is available at:
  *
  *   http://www.semiconductors.philips.com/pip/p87c52ubaa
- *   http://www.semiconductors.philips.com/pip/pdiusbd12
+ *   http://www.nxp.com/acrobat_download/various/PDIUSBD12_PROGRAMMING_GUIDE.pdf
  *
  * Thanks to Julian Highfield for the pointer to the Philips database.
  *
diff --git a/drivers/usb/storage/Kconfig b/drivers/usb/storage/Kconfig
index 8a372bac0e43..b356e1565db6 100644
--- a/drivers/usb/storage/Kconfig
+++ b/drivers/usb/storage/Kconfig
@@ -36,7 +36,7 @@ config USB_STORAGE_DATAFAB
 	depends on USB_STORAGE
 	help
 	  Support for certain Datafab CompactFlash readers.
-	  Datafab has a web page at <http://www.datafabusa.com/>.
+	  Datafab has a web page at <http://www.datafab.com/>.
 
 	  If this driver is compiled as a module, it will be named ums-datafab.
 
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index a9ca72f301bf..d4979f3f77fb 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -49,7 +49,7 @@ menuconfig FB
 	  You need an utility program called fbset to make full use of frame
 	  buffer devices. Please read <file:Documentation/fb/framebuffer.txt>
 	  and the Framebuffer-HOWTO at
-	  <http://www.munted.org.uk/programming/Framebuffer-HOWTO-1.2.html> for more
+	  <http://www.munted.org.uk/programming/Framebuffer-HOWTO-1.3.html> for more
 	  information.
 
 	  Say Y here and to the driver for your graphics board below if you
@@ -955,7 +955,7 @@ config FB_EPSON1355
 	  Build in support for the SED1355 Epson Research Embedded RAMDAC
 	  LCD/CRT Controller (since redesignated as the S1D13505) as a
 	  framebuffer.  Product specs at
-	  <http://www.erd.epson.com/vdc/html/products.htm>.
+	  <http://vdc.epson.com/>.
 
 config FB_S1D13XXX
 	tristate "Epson S1D13XXX framebuffer support"
@@ -966,7 +966,7 @@ config FB_S1D13XXX
 	help
 	  Support for S1D13XXX framebuffer device family (currently only
 	  working with S1D13806). Product specs at
-	  <http://www.erd.epson.com/vdc/html/legacy_13xxx.htm>
+	  <http://vdc.epson.com/>
 
 config FB_ATMEL
 	tristate "AT91/AT32 LCD Controller support"
@@ -1323,7 +1323,7 @@ config FB_RADEON
 	  don't need to choose this to run the Radeon in plain VGA mode.
 
 	  There is a product page at
-	  http://apps.ati.com/ATIcompare/
+	  http://products.amd.com/en-us/GraphicCardResult.aspx
 
 config FB_RADEON_I2C
 	bool "DDC/I2C for ATI Radeon support"
@@ -1395,7 +1395,7 @@ config FB_ATY_CT
 	  Say Y here to support use of ATI's 64-bit Rage boards (or other
 	  boards based on the Mach64 CT, VT, GT, and LT chipsets) as a
 	  framebuffer device.  The ATI product support page for these boards
-	  is at <http://support.ati.com/products/pc/mach64/>.
+	  is at <http://support.ati.com/products/pc/mach64/mach64.html>.
 
 config FB_ATY_GENERIC_LCD
 	bool "Mach64 generic LCD support (EXPERIMENTAL)"
diff --git a/drivers/video/arcfb.c b/drivers/video/arcfb.c
index f3d7440f0072..3ec4923c2d84 100644
--- a/drivers/video/arcfb.c
+++ b/drivers/video/arcfb.c
@@ -2,7 +2,6 @@
  * linux/drivers/video/arcfb.c -- FB driver for Arc monochrome LCD board
  *
  * Copyright (C) 2005, Jaya Kumar <jayalk@intworks.biz>
- * http://www.intworks.biz/arclcd
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License. See the file COPYING in the main directory of this archive for
diff --git a/drivers/video/epson1355fb.c b/drivers/video/epson1355fb.c
index db9713b49ce9..a268cbf1cbea 100644
--- a/drivers/video/epson1355fb.c
+++ b/drivers/video/epson1355fb.c
@@ -4,7 +4,7 @@
  * Epson Research S1D13505 Embedded RAMDAC LCD/CRT Controller
  *   (previously known as SED1355)
  *
- * Cf. http://www.erd.epson.com/vdc/html/S1D13505.html
+ * Cf. http://vdc.epson.com/
  *
  *
  * Copyright (C) Hewlett-Packard Company.  All rights reserved.
diff --git a/drivers/video/fbcvt.c b/drivers/video/fbcvt.c
index 7293eaccd81b..7cb715dfc0e1 100644
--- a/drivers/video/fbcvt.c
+++ b/drivers/video/fbcvt.c
@@ -5,7 +5,7 @@
  *
  *      Based from the VESA(TM) Coordinated Video Timing Generator by
  *      Graham Loveridge April 9, 2003 available at
- *      http://www.vesa.org/public/CVT/CVTd6r1.xls
+ *      http://www.elo.utfsm.cl/~elo212/docs/CVTd6r1.xls
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file COPYING in the main directory of this archive
diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c
index 9b3d6e4584cc..63ed3b72b01c 100644
--- a/drivers/video/metronomefb.c
+++ b/drivers/video/metronomefb.c
@@ -10,7 +10,7 @@
  * Layout is based on skeletonfb.c by James Simmons and Geert Uytterhoeven.
  *
  * This work was made possible by help and equipment support from E-Ink
- * Corporation. http://support.eink.com/community
+ * Corporation. http://www.eink.com/
  *
  * This driver is written to be used with the Metronome display controller.
  * It is intended to be architecture independent. A board specific driver
diff --git a/firmware/keyspan_pda/keyspan_pda.S b/firmware/keyspan_pda/keyspan_pda.S
index 418fe69aa5e0..f3acc197a5ef 100644
--- a/firmware/keyspan_pda/keyspan_pda.S
+++ b/firmware/keyspan_pda/keyspan_pda.S
@@ -74,7 +74,7 @@
  *  recognizes the new device ID and glues it to the real serial driver code.
  *
  * USEFUL DOCS:
- *  EzUSB Technical Reference Manual: <http://www.anchorchips.com>
+ *  EzUSB Technical Reference Manual: <http://www.cypress.com/>
  *  8051 manuals: everywhere, but try www.dalsemi.com because the EzUSB is
  *   basically the Dallas enhanced 8051 code. Remember that the EzUSB IO ports
  *   use totally different registers!
diff --git a/firmware/keyspan_pda/xircom_pgs.S b/firmware/keyspan_pda/xircom_pgs.S
index 05d99dd63776..0b79bbf0ae15 100644
--- a/firmware/keyspan_pda/xircom_pgs.S
+++ b/firmware/keyspan_pda/xircom_pgs.S
@@ -74,7 +74,7 @@
  *  recognizes the new device ID and glues it to the real serial driver code.
  *
  * USEFUL DOCS:
- *  EzUSB Technical Reference Manual: <http://www.anchorchips.com>
+ *  EzUSB Technical Reference Manual: <http://www.cypress.com/>
  *  8051 manuals: everywhere, but try www.dalsemi.com because the EzUSB is
  *   basically the Dallas enhanced 8051 code. Remember that the EzUSB IO ports
  *   use totally different registers!
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..7c232c1487ee 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
  * #define ATTR_KILL_SUID	2048
  * #define ATTR_KILL_SGID	4096
  *
- * and this is because they were added in 2.5 development in this patch:
- *
- * http://linux.bkbits.net:8080/linux-2.5/
- * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
- * |src/.|src/include|src/include/linux|related/include/linux/fs.h
- *
+ * and this is because they were added in 2.5 development.
  * Actually, they are not needed by most ->setattr() methods - they are set by
  * callers of notify_change() to notify that the setuid/setgid bits must be
  * dropped.
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 648c9d8f3357..e55fefcb0dcc 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
  *
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
 
 	  In general, ReiserFS is as fast as ext2, but is very efficient with
 	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see <http://www.namesys.com/> for links.
+	  for NFS and quotas, please see 
+	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
 
 	  It is more easily extended to have features currently found in
 	  database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
 	  plugins consistent with our motto ``It takes more than a license to
 	  make source code open.''
 
-	  Read <http://www.namesys.com/> to learn more about reiserfs.
+	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
+	  to learn more about reiserfs.
 
 	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
 
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page 
 
 Stop reading here.  Go there, then return.
 
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 4086b8ebfafe..da2530e34b26 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -54,8 +54,8 @@
 
 /* Comment by Rik:
  *
- * For some background on GF(2^128) see for example: http://-
- * csrc.nist.gov/CryptoToolkit/modes/proposedmodes/gcm/gcm-revised-spec.pdf
+ * For some background on GF(2^128) see for example: 
+ * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf 
  *
  * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can
  * be mapped to computer memory in a variety of ways. Let's examine
diff --git a/include/linux/fdreg.h b/include/linux/fdreg.h
index c2eeb63b72db..61ce64169004 100644
--- a/include/linux/fdreg.h
+++ b/include/linux/fdreg.h
@@ -89,7 +89,7 @@
 /* the following commands are new in the 82078. They are not used in the
  * floppy driver, except the first three. These commands may be useful for apps
  * which use the FDRAWCMD interface. For doc, get the 82078 spec sheets at
- * http://www-techdoc.intel.com/docs/periph/fd_contr/datasheets/ */
+ * http://www.intel.com/design/archives/periphrl/docs/29046803.htm */
 
 #define FD_PARTID		0x18	/* part id ("extended" version cmd) */
 #define FD_SAVE			0x2e	/* save fdc regs for later restore */
diff --git a/include/linux/if_infiniband.h b/include/linux/if_infiniband.h
index 3e659ec7dfdd..7d958475d4ac 100644
--- a/include/linux/if_infiniband.h
+++ b/include/linux/if_infiniband.h
@@ -5,7 +5,7 @@
  * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
  * license, available in the LICENSE.TXT file accompanying this
  * software.  These details are also available at
- * <http://openib.org/license.html>.
+ * <http://www.openfabrics.org/software_license.htm>.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
diff --git a/include/linux/n_r3964.h b/include/linux/n_r3964.h
index de24af79ebd3..54b8e0d8d916 100644
--- a/include/linux/n_r3964.h
+++ b/include/linux/n_r3964.h
@@ -4,7 +4,6 @@
  * Copyright by
  * Philips Automation Projects
  * Kassel (Germany)
- * http://www.pap-philips.de
  * -----------------------------------------------------------
  * This software may be used and distributed according to the terms of
  * the GNU General Public License, incorporated herein by reference.
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index 2a72aa96a568..705e53ef4af0 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -7,7 +7,7 @@ menuconfig HAMRADIO
 	bool "Amateur Radio support"
 	help
 	  If you want to connect your Linux box to an amateur radio, answer Y
-	  here. You want to read <http://www.tapr.org/tapr/html/pkthome.html>
+	  here. You want to read <http://www.tapr.org/>
 	  and more specifically about AX.25 on Linux
 	  <http://www.linux-ax25.org/>.
 
@@ -42,7 +42,7 @@ config AX25
 	  check out the file <file:Documentation/networking/ax25.txt> in the
 	  kernel source. More information about digital amateur radio in
 	  general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called ax25.
@@ -89,7 +89,7 @@ config NETROM
 	  <http://www.linux-ax25.org>. You also might want to check out the
 	  file <file:Documentation/networking/ax25.txt>. More information about
 	  digital amateur radio in general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called netrom.
@@ -108,7 +108,7 @@ config ROSE
 	  <http://www.linux-ax25.org>.  You also might want to check out the
 	  file <file:Documentation/networking/ax25.txt>. More information about
 	  digital amateur radio in general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rose.
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..1cc7ef270d54 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -84,7 +84,7 @@ config IP_FIB_TRIE
 
 	  An experimental study of compression methods for dynamic tries
 	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
 
 endchoice
 
@@ -555,7 +555,7 @@ config TCP_CONG_VENO
 	distinguishing to circumvent the difficult judgment of the packet loss
 	type. TCP Veno cuts down less congestion window in response to random
 	loss packets.
-	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+	See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 
 
 config TCP_CONG_YEAH
 	tristate "YeAH TCP"
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41d..094e150c6260 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
  *
  * The CIPSO draft specification can be found in the kernel's Documentation
  * directory as well as the following URL:
- *   http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
  * The FIPS-188 specification can be found at the following URL:
  *   http://www.itl.nist.gov/fipspubs/fip188.htm
  *
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939ba..2230ae3bf20e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -16,7 +16,7 @@
  *
  * An experimental study of compression methods for dynamic tries
  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  *
  *
  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..d048275a62cb 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG
 	  which can only be viewed through syslog.
 
 	  The appropriate userspace logging daemon (ulogd) may be obtained from
-	  <http://www.gnumonks.org/projects/ulogd/>
+	  <http://www.netfilter.org/projects/ulogd/index.html>
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..00ca688d8964 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
  * The algorithm is described in:
  * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
  *  for High-Speed Networks"
- * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
  *
  * Implemented from description in paper and ns-2 simulation.
  * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e663b78a2ef6..bccce3424a63 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -428,10 +428,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
  *
  * The algorithm for RTT estimation w/o timestamps is based on
  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
  *
  * More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
  * though this reference is out of date.  A new paper
  * is pending.
  */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf76183..38bc0b52d745 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
  *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
  *    IEEE Journal on Selected Areas in Communication,
  *    Feb. 2003.
- * 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ * 	See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
  */
 
 #include <linux/mm.h>
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index c4c885dca3bd..3fb2b73b24dc 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -329,8 +329,8 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
    in IP Filter' by Guido van Rooij.
 
-   http://www.nluug.nl/events/sane2000/papers.html
-   http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+   http://www.sane.nl/events/sane2000/papers.html
+   http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
 
    The boundaries and the conditions are changed according to RFC793:
    the packet must intersect the window (i.e. segments may be
diff --git a/sound/oss/ac97_codec.c b/sound/oss/ac97_codec.c
index 456a1b4d7832..854c303264dc 100644
--- a/sound/oss/ac97_codec.c
+++ b/sound/oss/ac97_codec.c
@@ -21,11 +21,8 @@
  *
  **************************************************************************
  *
- * The Intel Audio Codec '97 specification is available at the Intel
- * audio homepage: http://developer.intel.com/ial/scalableplatforms/audio/
- *
- * The specification itself is currently available at:
- * ftp://download.intel.com/ial/scalableplatforms/ac97r22.pdf
+ * The Intel Audio Codec '97 specification is available at:
+ * http://download.intel.com/support/motherboards/desktop/sb/ac97_r23.pdf
  *
  **************************************************************************
  *
diff --git a/sound/pci/ens1370.c b/sound/pci/ens1370.c
index c7fba5379813..537cfba829a5 100644
--- a/sound/pci/ens1370.c
+++ b/sound/pci/ens1370.c
@@ -22,7 +22,7 @@
 /* Power-Management-Code ( CONFIG_PM )
  * for ens1371 only ( FIXME )
  * derived from cs4281.c, atiixp.c and via82xx.c
- * using http://www.alsa-project.org/~iwai/writing-an-alsa-driver/c1540.htm
+ * using http://www.alsa-project.org/~tiwai/writing-an-alsa-driver/ 
  * by Kurt J. Bosch
  */
 
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index 6433e65c9507..a2999d678918 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -716,7 +716,7 @@ static void snd_intel8x0_setup_periods(struct intel8x0 *chip, struct ichdev *ich
  * Intel 82443MX running a 100MHz processor system bus has a hardware bug,
  * which aborts PCI busmaster for audio transfer.  A workaround is to set
  * the pages as non-cached.  For details, see the errata in
- *	http://www.intel.com/design/chipsets/specupdt/245051.htm
+ *	http://download.intel.com/design/chipsets/specupdt/24505108.pdf
  */
 static void fill_nocache(void *buf, int size, int nocache)
 {
-- 
cgit v1.2.3


From 33027af637da3f69bd17488cc3e68493c9052a7d Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 16 Oct 2010 15:19:22 +0200
Subject: GFS2: fixed typo

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
  * @gl: the glock
  * @state: the state we're requesting
  * @flags: the modifier flags
-- 
cgit v1.2.3


From 7aebf4106b4263667696485303af498aa1eea9ef Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <Boaz Harrosh bharrosh@panasas.com>
Date: Wed, 13 Oct 2010 12:55:43 -0400
Subject: exofs: Cleaup read path in regard with read_for_write

Last BUG fix added a flag to the the page_collect structure
to communicate with readpage_strip. This calls for a clean up
removing that flag's reincarnations in the read functions
parameters.

Signed-off-by: Boaz Harrosh <Boaz Harrosh bharrosh@panasas.com>
---
 fs/exofs/inode.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3eadd97324b1..24ab327a20cf 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
  * status.
  */
-static int __readpages_done(struct page_collect *pcol, bool do_unlock)
+static int __readpages_done(struct page_collect *pcol)
 {
 	int i;
 	u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 			  page_stat ? "bad_bytes" : "good_bytes");
 
 		ret = update_read_page(page, page_stat);
-		if (do_unlock)
+		if (!pcol->read_4_write)
 			unlock_page(page);
 		length += PAGE_SIZE;
 	}
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 
-	__readpages_done(pcol, true);
+	__readpages_done(pcol);
 	atomic_dec(&pcol->sbi->s_curr_pending);
 	kfree(pcol);
 }
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 	}
 }
 
-static int read_exec(struct page_collect *pcol, bool is_sync)
+static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
 	struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	if (!pcol->pages)
 		return 0;
 
-	/* see comment in _readpage() about sync reads */
-	WARN_ON(is_sync && (pcol->nr_pages != 1));
-
 	ios->pages = pcol->pages;
 	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
-	if (is_sync) {
+	if (pcol->read_4_write) {
 		exofs_oi_read(oi, pcol->ios);
-		return __readpages_done(pcol, false);
+		return __readpages_done(pcol);
 	}
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	return 0;
 
 err:
-	if (!is_sync)
+	if (!pcol->read_4_write)
 		_unlock_pcol_pages(pcol, ret, READ);
 
 	pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
 		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
 			     " splitting\n", inode->i_ino, page->index);
 
-		return read_exec(pcol, false);
+		return read_exec(pcol);
 	}
 
 try_again:
@@ -366,7 +363,7 @@ try_again:
 	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
 		   page->index)) {
 		/* Discontinuity detected, split the request */
-		ret = read_exec(pcol, false);
+		ret = read_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
 		goto try_again;
@@ -391,7 +388,7 @@ try_again:
 			  page, len, pcol->nr_pages, pcol->length);
 
 		/* split the request, and start again with current page */
-		ret = read_exec(pcol, false);
+		ret = read_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
 
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
-	return read_exec(&pcol, false);
+	return read_exec(&pcol);
 }
 
-static int _readpage(struct page *page, bool is_sync)
+static int _readpage(struct page *page, bool read_4_write)
 {
 	struct page_collect pcol;
 	int ret;
 
 	_pcol_init(&pcol, 1, page->mapping->host);
 
-	/* readpage_strip might call read_exec(,is_sync==false) at several
-	 * places but not if we have a single page.
-	 */
-	pcol.read_4_write = is_sync;
+	pcol.read_4_write = read_4_write;
 	ret = readpage_strip(&pcol, page);
 	if (ret) {
 		EXOFS_ERR("_readpage => %d\n", ret);
 		return ret;
 	}
 
-	return read_exec(&pcol, is_sync);
+	return read_exec(&pcol);
 }
 
 /*
-- 
cgit v1.2.3


From 115e19c53501edc11f730191f7f047736815ae3d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <Boaz Harrosh bharrosh@panasas.com>
Date: Thu, 7 Oct 2010 14:28:18 -0400
Subject: exofs: Set i_mapping->backing_dev_info anyway

Though it has been promised that inode->i_mapping->backing_dev_info
is not used and the supporting code is fine. Until the pointer
will default to NULL, I'd rather it points to the correct thing
regardless.

At least for future infrastructure coder it is a clear indication
of where are the key points that inodes are initialized.
I know because it took me time to find this out.

Signed-off-by: Boaz Harrosh <Boaz Harrosh bharrosh@panasas.com>
---
 fs/exofs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 24ab327a20cf..0ba9886da2ec 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1030,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1129,6 +1130,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
-- 
cgit v1.2.3


From 3ae4c9deb30a8d5ee305b461625dcb298c9804a9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 12:01:50 +1000
Subject: xfs: use range primitives for xfs page cache operations

While XFS passes ranges to operate on from the core code, the
functions being called ignore the either the entire range or the end
of the range. This is historical because when the function were
written linux didn't have the necessary range operations. Update the
functions to use the correct operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_fs_subr.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping->nrpages)
-		truncate_inode_pages(mapping, first);
+	/* can't toss partial tail pages, so mask them out */
+	last &= ~(PAGE_SIZE - 1);
+	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
 }
 
 int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
 
 	trace_xfs_pagecache_inval(ip, first, last);
 
-	if (mapping->nrpages) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_write_and_wait(mapping);
-		if (!ret)
-			truncate_inode_pages(mapping, first);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = filemap_write_and_wait_range(mapping, first,
+				last == -1 ? LLONG_MAX : last);
+	if (!ret)
+		truncate_inode_pages_range(mapping, first, last);
 	return -ret;
 }
 
@@ -71,10 +69,9 @@ xfs_flush_pages(
 	int		ret = 0;
 	int		ret2;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = -filemap_fdatawrite(mapping);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = -filemap_fdatawrite_range(mapping, first,
+				last == -1 ? LLONG_MAX : last);
 	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
 {
 	struct address_space *mapping = VFS_I(ip)->i_mapping;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
-		return -filemap_fdatawait(mapping);
+	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+		return -filemap_fdatawait_range(mapping, first,
+					last == -1 ? ip->i_size - 1 : last);
+	}
 	return 0;
 }
-- 
cgit v1.2.3


From 447223520520b17d3b6d0631aa4838fbaf8eddb4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 12:02:11 +1000
Subject: xfs: Introduce XFS_IOC_ZERO_RANGE

XFS_IOC_ZERO_RANGE is the equivalent of an atomic XFS_IOC_UNRESVSP/
XFS_IOC_RESVSP call pair. It enabled ranges of written data to be
turned into zeroes without requiring IO or having to free and
reallocate the extents in the range given as would occur if we had
to punch and then preallocate them separately.  This enables
applications to zero parts of files very quickly without changing
the layout of the files in any way.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   |  3 ++-
 fs/xfs/linux-2.6/xfs_ioctl32.c |  1 +
 fs/xfs/xfs_bmap.c              | 12 +++++++++---
 fs/xfs/xfs_bmap.h              |  9 ++++++---
 fs/xfs/xfs_fs.h                |  1 +
 fs/xfs/xfs_vnodeops.c          | 10 ++++++++--
 6 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..03aa908a9cb9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1301,7 +1301,8 @@ xfs_file_ioctl(
 	case XFS_IOC_ALLOCSP64:
 	case XFS_IOC_FREESP64:
 	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64: {
+	case XFS_IOC_UNRESVSP64:
+	case XFS_IOC_ZERO_RANGE: {
 		xfs_flock64_t		bf;
 
 		if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..464bcc76e980 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -574,6 +574,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_FSGEOMETRY_V1:
 	case XFS_IOC_FSGROWFSDATA:
 	case XFS_IOC_FSGROWFSRT:
+	case XFS_IOC_ZERO_RANGE:
 		return xfs_file_ioctl(filp, cmd, p);
 #else
 	case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..5e33b7862d41 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4744,8 +4744,12 @@ xfs_bmapi(
 		 * Check if writing previously allocated but
 		 * unwritten extents.
 		 */
-		if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
-		    ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
+		if (wr &&
+		    ((mval->br_state == XFS_EXT_UNWRITTEN &&
+		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
+		     (mval->br_state == XFS_EXT_NORM &&
+		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
+				(XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
 			/*
 			 * Modify (by adding) the state flag, if writing.
 			 */
@@ -4757,7 +4761,9 @@ xfs_bmapi(
 					*firstblock;
 				cur->bc_private.b.flist = flist;
 			}
-			mval->br_state = XFS_EXT_NORM;
+			mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+						? XFS_EXT_NORM
+						: XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
 				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..71ec9b6ecdfc 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef	struct xfs_bmap_free
 #define	XFS_BMAPI_IGSTATE	0x080	/* Ignore state - */
 					/* combine contig. space */
 #define	XFS_BMAPI_CONTIG	0x100	/* must allocate only one extent */
-#define XFS_BMAPI_CONVERT	0x200	/* unwritten extent conversion - */
-					/* need write cache flushing and no */
-					/* additional allocation alignments */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT	0x200
 
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_WRITE,	"WRITE" }, \
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..10f6093671b0 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -448,6 +448,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..dc6e4fb8bbc9 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2272,7 +2272,7 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -2711,6 +2711,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2753,12 +2754,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-								1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
-- 
cgit v1.2.3


From 52fda114249578311776b25da9f73a9c34f4fd8c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 6 Sep 2010 01:44:22 +0000
Subject: xfs: simplify xfs_qm_dqusage_adjust

There is no need to have the users and group/project quota locked at the
same time.  Get rid of xfs_qm_dqget_noattach and just do a xfs_qm_dqget
inside xfs_qm_quotacheck_dqadjust for the quota we are operating on
right now.  The new version of xfs_qm_quotacheck_dqadjust holds the
inode lock over it's operations, which is not a problem as it simply
increments counters and there is no concern about log contention
during mount time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 203 +++++++++++++++-----------------------------------
 1 file changed, 61 insertions(+), 142 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..121f632213a7 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1199,87 +1199,6 @@ xfs_qm_list_destroy(
 	mutex_destroy(&(list->qh_lock));
 }
 
-
-/*
- * Stripped down version of dqattach. This doesn't attach, or even look at the
- * dquots attached to the inode. The rationale is that there won't be any
- * attached at the time this is called from quotacheck.
- */
-STATIC int
-xfs_qm_dqget_noattach(
-	xfs_inode_t	*ip,
-	xfs_dquot_t	**O_udqpp,
-	xfs_dquot_t	**O_gdqpp)
-{
-	int		error;
-	xfs_mount_t	*mp;
-	xfs_dquot_t	*udqp, *gdqp;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	mp = ip->i_mount;
-	udqp = NULL;
-	gdqp = NULL;
-
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(ip->i_udquot == NULL);
-		/*
-		 * We want the dquot allocated if it doesn't exist.
-		 */
-		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
-					 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
-					 &udqp))) {
-			/*
-			 * Shouldn't be able to turn off quotas here.
-			 */
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(udqp);
-	}
-
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(ip->i_gdquot == NULL);
-		if (udqp)
-			xfs_dqunlock(udqp);
-		error = XFS_IS_GQUOTA_ON(mp) ?
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_gid, XFS_DQ_GROUP,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp) :
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_projid, XFS_DQ_PROJ,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp);
-		if (error) {
-			if (udqp)
-				xfs_qm_dqrele(udqp);
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(gdqp);
-
-		/* Reacquire the locks in the right order */
-		if (udqp) {
-			if (! xfs_qm_dqlock_nowait(udqp)) {
-				xfs_dqunlock(gdqp);
-				xfs_dqlock(udqp);
-				xfs_dqlock(gdqp);
-			}
-		}
-	}
-
-	*O_udqpp = udqp;
-	*O_gdqpp = gdqp;
-
-#ifdef QUOTADEBUG
-	if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
-	if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
-#endif
-	return 0;
-}
-
 /*
  * Create an inode and return with a reference already taken, but unlocked
  * This is how we create quota inodes
@@ -1546,18 +1465,34 @@ xfs_qm_dqiterate(
 
 /*
  * Called by dqusage_adjust in doing a quotacheck.
- * Given the inode, and a dquot (either USR or GRP, doesn't matter),
- * this updates its incore copy as well as the buffer copy. This is
- * so that once the quotacheck is done, we can just log all the buffers,
- * as opposed to logging numerous updates to individual dquots.
+ *
+ * Given the inode, and a dquot id this updates both the incore dqout as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
  */
-STATIC void
+STATIC int
 xfs_qm_quotacheck_dqadjust(
-	xfs_dquot_t		*dqp,
+	struct xfs_inode	*ip,
+	xfs_dqid_t		id,
+	uint			type,
 	xfs_qcnt_t		nblks,
 	xfs_qcnt_t		rtblks)
 {
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	error = xfs_qm_dqget(mp, ip, id, type,
+			     XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+	if (error) {
+		/*
+		 * Shouldn't be able to turn off quotas here.
+		 */
+		ASSERT(error != ESRCH);
+		ASSERT(error != ENOENT);
+		return error;
+	}
 
 	trace_xfs_dqadjust(dqp);
 
@@ -1582,11 +1517,13 @@ xfs_qm_quotacheck_dqadjust(
 	 * There are no timers for the default values set in the root dquot.
 	 */
 	if (dqp->q_core.d_id) {
-		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
-		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
+		xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+		xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
 	}
 
 	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_qm_dqput(dqp);
+	return 0;
 }
 
 STATIC int
@@ -1629,8 +1566,7 @@ xfs_qm_dqusage_adjust(
 	int		*res)		/* result code value */
 {
 	xfs_inode_t	*ip;
-	xfs_dquot_t	*udqp, *gdqp;
-	xfs_qcnt_t	nblks, rtblks;
+	xfs_qcnt_t	nblks, rtblks = 0;
 	int		error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1586,24 @@ xfs_qm_dqusage_adjust(
 	 * the case in all other instances. It's OK that we do this because
 	 * quotacheck is done only at mount time.
 	 */
-	if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
+	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+	if (error) {
 		*res = BULKSTAT_RV_NOTHING;
 		return error;
 	}
 
-	/*
-	 * Obtain the locked dquots. In case of an error (eg. allocation
-	 * fails for ENOSPC), we return the negative of the error number
-	 * to bulkstat, so that it can get propagated to quotacheck() and
-	 * making us disable quotas for the file system.
-	 */
-	if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		IRELE(ip);
-		*res = BULKSTAT_RV_GIVEUP;
-		return error;
-	}
+	ASSERT(ip->i_delayed_blks == 0);
 
-	rtblks = 0;
-	if (! XFS_IS_REALTIME_INODE(ip)) {
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
-	} else {
+	if (XFS_IS_REALTIME_INODE(ip)) {
 		/*
 		 * Walk thru the extent list and count the realtime blocks.
 		 */
-		if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			IRELE(ip);
-			if (udqp)
-				xfs_qm_dqput(udqp);
-			if (gdqp)
-				xfs_qm_dqput(gdqp);
-			*res = BULKSTAT_RV_GIVEUP;
-			return error;
-		}
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+		error = xfs_qm_get_rtblks(ip, &rtblks);
+		if (error)
+			goto error0;
 	}
-	ASSERT(ip->i_delayed_blks == 0);
 
-	/*
-	 * We can't release the inode while holding its dquot locks.
-	 * The inode can go into inactive and might try to acquire the dquotlocks.
-	 * So, just unlock here and do a vn_rele at the end.
-	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
 
 	/*
 	 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1618,36 @@ xfs_qm_dqusage_adjust(
 	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
 	 */
 	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(udqp);
-		xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
-		xfs_qm_dqput(udqp);
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
+						   XFS_DQ_USER, nblks, rtblks);
+		if (error)
+			goto error0;
 	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(gdqp);
-		xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
-		xfs_qm_dqput(gdqp);
+
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
+						   XFS_DQ_GROUP, nblks, rtblks);
+		if (error)
+			goto error0;
 	}
-	/*
-	 * Now release the inode. This will send it to 'inactive', and
-	 * possibly even free blocks.
-	 */
-	IRELE(ip);
 
-	/*
-	 * Goto next inode.
-	 */
+	if (XFS_IS_PQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_projid,
+						   XFS_DQ_PROJ, nblks, rtblks);
+		if (error)
+			goto error0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
 	*res = BULKSTAT_RV_DIDONE;
 	return 0;
+
+error0:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+	*res = BULKSTAT_RV_GIVEUP;
+	return error;
 }
 
 /*
-- 
cgit v1.2.3


From acecf1b5d8a846bf818bf74df454330f0b444b0a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 6 Sep 2010 01:44:45 +0000
Subject: xfs: stop using xfs_qm_dqtobp in xfs_qm_dqflush

In xfs_qm_dqflush we know that q_blkno must be initialized already from a
previous xfs_qm_dqread.  So instead of calling xfs_qm_dqtobp we can
simply read the quota buffer directly.  This also saves us from a duplicate
xfs_qm_dqcheck call check and allows xfs_qm_dqtobp to be simplified now
that it is always called for a newly initialized inode.  In addition to
that properly unwind all locks in xfs_qm_dqflush when xfs_qm_dqcheck
fails.

This mirrors a similar cleanup in the inode lookup done earlier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c | 164 ++++++++++++++++++++++-------------------------
 1 file changed, 76 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..faf8e1a83a12 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -463,87 +463,68 @@ xfs_qm_dqtobp(
 	uint			flags)
 {
 	xfs_bmbt_irec_t map;
-	int		nmaps, error;
+	int		nmaps = 1, error;
 	xfs_buf_t	*bp;
-	xfs_inode_t	*quotip;
-	xfs_mount_t	*mp;
+	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
+	xfs_mount_t	*mp = dqp->q_mount;
 	xfs_disk_dquot_t *ddq;
-	xfs_dqid_t	id;
-	boolean_t	newdquot;
+	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
 	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
-	mp = dqp->q_mount;
-	id = be32_to_cpu(dqp->q_core.d_id);
-	nmaps = 1;
-	newdquot = B_FALSE;
+	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
-	/*
-	 * If we don't know where the dquot lives, find out.
-	 */
-	if (dqp->q_blkno == (xfs_daddr_t) 0) {
-		/* We use the id as an index */
-		dqp->q_fileoffset = (xfs_fileoff_t)id /
-					mp->m_quotainfo->qi_dqperchunk;
-		nmaps = 1;
-		quotip = XFS_DQ_TO_QIP(dqp);
-		xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
 		/*
-		 * Return if this type of quotas is turned off while we didn't
-		 * have an inode lock
+		 * Return if this type of quotas is turned off while we
+		 * didn't have the quota inode lock.
 		 */
-		if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-			xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-			return (ESRCH);
-		}
+		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		return ESRCH;
+	}
+
+	/*
+	 * Find the block map; no allocations yet
+	 */
+	error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+			  XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+			  NULL, 0, &map, &nmaps, NULL);
+
+	xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+	if (error)
+		return error;
+
+	ASSERT(nmaps == 1);
+	ASSERT(map.br_blockcount == 1);
+
+	/*
+	 * Offset of dquot in the (fixed sized) dquot chunk.
+	 */
+	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+		sizeof(xfs_dqblk_t);
+
+	ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+	if (map.br_startblock == HOLESTARTBLOCK) {
 		/*
-		 * Find the block map; no allocations yet
+		 * We don't allocate unless we're asked to
 		 */
-		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
-				  XFS_DQUOT_CLUSTER_SIZE_FSB,
-				  XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmaps, NULL);
+		if (!(flags & XFS_QMOPT_DQALLOC))
+			return ENOENT;
 
-		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		ASSERT(tp);
+		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+					dqp->q_fileoffset, &bp);
 		if (error)
-			return (error);
-		ASSERT(nmaps == 1);
-		ASSERT(map.br_blockcount == 1);
+			return error;
+		tp = *tpp;
+	} else {
+		trace_xfs_dqtobp_read(dqp);
 
 		/*
-		 * offset of dquot in the (fixed sized) dquot chunk.
+		 * store the blkno etc so that we don't have to do the
+		 * mapping all the time
 		 */
-		dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
-			sizeof(xfs_dqblk_t);
-		if (map.br_startblock == HOLESTARTBLOCK) {
-			/*
-			 * We don't allocate unless we're asked to
-			 */
-			if (!(flags & XFS_QMOPT_DQALLOC))
-				return (ENOENT);
-
-			ASSERT(tp);
-			if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-						dqp->q_fileoffset, &bp)))
-				return (error);
-			tp = *tpp;
-			newdquot = B_TRUE;
-		} else {
-			/*
-			 * store the blkno etc so that we don't have to do the
-			 * mapping all the time
-			 */
-			dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-		}
-	}
-	ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
-	ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
-
-	/*
-	 * Read in the buffer, unless we've just done the allocation
-	 * (in which case we already have the buf).
-	 */
-	if (!newdquot) {
-		trace_xfs_dqtobp_read(dqp);
+		dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
 
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
@@ -552,13 +533,14 @@ xfs_qm_dqtobp(
 		if (error || !bp)
 			return XFS_ERROR(error);
 	}
+
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 
 	/*
 	 * calculate the location of the dquot inside the buffer.
 	 */
-	ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+	ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
 
 	/*
 	 * A simple sanity check in case we got a corrupted dquot...
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush(
 	xfs_dquot_t		*dqp,
 	uint			flags)
 {
-	xfs_mount_t		*mp;
-	xfs_buf_t		*bp;
-	xfs_disk_dquot_t	*ddqp;
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_buf		*bp;
+	struct xfs_disk_dquot	*ddqp;
 	int			error;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	ASSERT(!completion_done(&dqp->q_flush));
+
 	trace_xfs_dqflush(dqp);
 
 	/*
-	 * If not dirty, or it's pinned and we are not supposed to
-	 * block, nada.
+	 * If not dirty, or it's pinned and we are not supposed to block, nada.
 	 */
 	if (!XFS_DQ_IS_DIRTY(dqp) ||
 	    (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1183,46 @@ xfs_qm_dqflush(
 	 * down forcibly. If that's the case we must not write this dquot
 	 * to disk, because the log record didn't make it to disk!
 	 */
-	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
-		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		dqp->dq_flags &= ~XFS_DQ_DIRTY;
 		xfs_dqfunlock(dqp);
 		return XFS_ERROR(EIO);
 	}
 
 	/*
 	 * Get the buffer containing the on-disk dquot
-	 * We don't need a transaction envelope because we know that the
-	 * the ondisk-dquot has already been allocated for.
 	 */
-	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
+	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+	if (error) {
 		ASSERT(error != ENOENT);
-		/*
-		 * Quotas could have gotten turned off (ESRCH)
-		 */
 		xfs_dqfunlock(dqp);
-		return (error);
+		return error;
 	}
 
-	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
-			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-		xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
+	/*
+	 * Calculate the location of the dquot inside the buffer.
+	 */
+	ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+	/*
+	 * A simple sanity check in case we got a corrupted dquot..
+	 */
+	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+			   XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+		xfs_buf_relse(bp);
+		xfs_dqfunlock(dqp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		return XFS_ERROR(EIO);
 	}
 
 	/* This is the only portion of data that needs to persist */
-	memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
+	memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
 
 	/*
 	 * Clear the dirty field and remember the flush lsn for later use.
 	 */
-	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
-	mp = dqp->q_mount;
+	dqp->dq_flags &= ~XFS_DQ_DIRTY;
 
 	xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
 					&dqp->q_logitem.qli_item.li_lsn);
-- 
cgit v1.2.3


From c0e59e1ac0a106bbab93404024bb6e7927ad9d6d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 7 Sep 2010 23:34:07 +0000
Subject: xfs: remove the ->kill_root btree operation

The implementation os ->kill_root only differ by either simply
zeroing out the now unused buffer in the btree cursor in the inode
allocation btree or using xfs_btree_setbuf in the allocation btree.

Initially both of them used xfs_btree_setbuf, but the use in the
ialloc btree was removed early on because it interacted badly with
xfs_trans_binval.

In addition to zeroing out the buffer in the cursor xfs_btree_setbuf
updates the bc_ra array in the btree cursor, and calls
xfs_trans_brelse on the buffer previous occupying the slot.

The bc_ra update should be done for the alloc btree updated too,
although the lack of it does not cause serious problems.  The
xfs_trans_brelse call on the other hand is effectively a no-op in
the end - it keeps decrementing the bli_recur refcount until it hits
zero, and then just skips out because the buffer will always be
dirty at this point.  So removing it for the allocation btree is
just fine.

So unify the code and move it to xfs_btree.c.  While we're at it
also replace the call to xfs_btree_setbuf with a NULL bp argument in
xfs_btree_del_cursor with a direct call to xfs_trans_brelse given
that the cursor is beeing freed just after this and the state
updates are superflous.  After this xfs_btree_setbuf is only used
with a non-NULL bp argument and can thus be simplified.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc_btree.c  | 33 ------------------------------
 fs/xfs/xfs_btree.c        | 52 +++++++++++++++++++++++++++++++++++++++--------
 fs/xfs/xfs_btree.h        | 14 +------------
 fs/xfs/xfs_ialloc_btree.c | 33 ------------------------------
 4 files changed, 44 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-STATIC int
-xfs_allocbt_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	xfs_allocbt_set_root(cur, newroot, -1);
-	error = xfs_allocbt_free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	xfs_btree_setbuf(cur, level, NULL);
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 
 	.dup_cursor		= xfs_allocbt_dup_cursor,
 	.set_root		= xfs_allocbt_set_root,
-	.kill_root		= xfs_allocbt_kill_root,
 	.alloc_block		= xfs_allocbt_alloc_block,
 	.free_block		= xfs_allocbt_free_block,
 	.update_lastrec		= xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..735dc2e671b1 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
 	 */
 	for (i = 0; i < cur->bc_nlevels; i++) {
 		if (cur->bc_bufs[i])
-			xfs_btree_setbuf(cur, i, NULL);
+			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 		else if (!error)
 			break;
 	}
@@ -763,22 +763,19 @@ xfs_btree_readahead(
  * Set the buffer for level "lev" in the cursor to bp, releasing
  * any previous buffer.
  */
-void
+STATIC void
 xfs_btree_setbuf(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			lev,	/* level in btree */
 	xfs_buf_t		*bp)	/* new buffer to set */
 {
 	struct xfs_btree_block	*b;	/* btree block */
-	xfs_buf_t		*obp;	/* old buffer pointer */
 
-	obp = cur->bc_bufs[lev];
-	if (obp)
-		xfs_trans_brelse(cur->bc_tp, obp);
+	if (cur->bc_bufs[lev])
+		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 	cur->bc_bufs[lev] = bp;
 	cur->bc_ra[lev] = 0;
-	if (!bp)
-		return;
+
 	b = XFS_BUF_TO_BLOCK(bp);
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 		if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3011,6 +3008,43 @@ out0:
 	return 0;
 }
 
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	union xfs_btree_ptr	*newroot)
+{
+	int			error;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+	/*
+	 * Update the root pointer, decreasing the level by 1 and then
+	 * free the old root.
+	 */
+	cur->bc_ops->set_root(cur, newroot, -1);
+
+	error = cur->bc_ops->free_block(cur, bp);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level] = NULL;
+	cur->bc_ra[level] = 0;
+	cur->bc_nlevels--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
 STATIC int
 xfs_btree_dec_cursor(
 	struct xfs_btree_cur	*cur,
@@ -3195,7 +3229,7 @@ xfs_btree_delrec(
 			 * Make it the new root of the btree.
 			 */
 			pp = xfs_btree_ptr_addr(cur, 1, block);
-			error = cur->bc_ops->kill_root(cur, bp, level, pp);
+			error = xfs_btree_kill_root(cur, bp, level, pp);
 			if (error)
 				goto error0;
 		} else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
 
 	/* update btree root pointer */
 	void	(*set_root)(struct xfs_btree_cur *cur,
-				union xfs_btree_ptr *nptr, int level_change);
-	int	(*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
-				int level, union xfs_btree_ptr *newroot);
+			    union xfs_btree_ptr *nptr, int level_change);
 
 	/* block allocation / freeing */
 	int	(*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
 	xfs_agblock_t		agbno,	/* allocation group block number */
 	xfs_extlen_t		count);	/* count of filesystem blocks */
 
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-void
-xfs_btree_setbuf(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			lev,	/* level in btree */
-	struct xfs_buf		*bp);	/* new buffer to set */
-
 
 /*
  * Common btree core entry points.
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
 			  cur->bc_rec.i.ir_startino;
 }
 
-STATIC int
-xfs_inobt_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	xfs_inobt_set_root(cur, newroot, -1);
-	error = xfs_inobt_free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	cur->bc_bufs[level] = NULL;
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_inobt_set_root,
-	.kill_root		= xfs_inobt_kill_root,
 	.alloc_block		= xfs_inobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
-- 
cgit v1.2.3


From 9c169915ad374cd9efb1556943b2074ec07e1749 Mon Sep 17 00:00:00 2001
From: Poyo VL <poyo_vl@yahoo.com>
Date: Thu, 2 Sep 2010 07:41:55 +0000
Subject: xfs: eliminate some newly-reported gcc warnings

Ionut Gabriel Popescu <poyo_vl@yahoo.com> submitted a simple change
to eliminate some "may be used uninitialized" warnings when building
XFS.  The reported condition seems to be something that GCC did not
used to recognize or report.  The warnings were produced by:

    gcc version 4.5.0 20100604
    [gcc-4_5-branch revision 160292] (SUSE Linux)

Signed-off-by: Ionut Gabriel Popescu <poyo_vl@yahoo.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..112abc439ca5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -675,7 +675,7 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	gtbnoa;		/* aligned ... */
 	xfs_extlen_t	gtdiff;		/* difference to right side entry */
 	xfs_extlen_t	gtlen;		/* length of right side entry */
-	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_extlen_t	gtlena = 0;	/* aligned ... */
 	xfs_agblock_t	gtnew;		/* useful start bno of right side */
 	int		error;		/* error code */
 	int		i;		/* result code, temporary */
@@ -684,7 +684,7 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	ltbnoa;		/* aligned ... */
 	xfs_extlen_t	ltdiff;		/* difference to left side entry */
 	xfs_extlen_t	ltlen;		/* length of left side entry */
-	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_extlen_t	ltlena = 0;	/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
 #if defined(DEBUG) && defined(__KERNEL__)
-- 
cgit v1.2.3


From d1583a3833290ab9f8b13a064acbb5e508c59f60 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 18:14:13 +1000
Subject: xfs: reduce the number of CIL lock round trips during commit

When commiting a transaction, we do a lock CIL state lock round trip
on every single log vector we insert into the CIL. This is resulting
in the lock being as hot as the inode and dcache locks on 8-way
create workloads. Rework the insertion loops to bring the number
of lock round trips to one per transaction for log vectors, and one
more do the busy extents.

Also change the allocation of the log vector buffer not to zero it
as we copy over the entire allocated buffer anyway.

This patch also includes a structural cleanup to the CIL item
insertion provided by Christoph Hellwig.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log_cil.c | 232 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 127 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..23d6ceb5e97b 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -145,102 +145,6 @@ xlog_cil_init_post_recovery(
 								log->l_curr_block);
 }
 
-/*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
-	struct log		*log,
-	struct xlog_ticket	*ticket,
-	struct xfs_log_item	*item,
-	struct xfs_log_vec	*lv)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xfs_log_vec	*old = lv->lv_item->li_lv;
-	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
-	int			len;
-	int			diff_iovecs;
-	int			iclog_space;
-
-	if (old) {
-		/* existing lv on log item, space used is a delta */
-		ASSERT(!list_empty(&item->li_cil));
-		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
-		len = lv->lv_buf_len - old->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
-		kmem_free(old->lv_buf);
-		kmem_free(old);
-	} else {
-		/* new lv, must pin the log item */
-		ASSERT(!lv->lv_item->li_lv);
-		ASSERT(list_empty(&item->li_cil));
-
-		len = lv->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs;
-		IOP_PIN(lv->lv_item);
-
-	}
-	len += diff_iovecs * sizeof(xlog_op_header_t);
-
-	/* attach new log vector to log item */
-	lv->lv_item->li_lv = lv;
-
-	spin_lock(&cil->xc_cil_lock);
-	list_move_tail(&item->li_cil, &cil->xc_cil);
-	ctx->nvecs += diff_iovecs;
-
-	/*
-	 * If this is the first time the item is being committed to the CIL,
-	 * store the sequence number on the log item so we can tell
-	 * in future commits whether this is the first checkpoint the item is
-	 * being committed into.
-	 */
-	if (!item->li_seq)
-		item->li_seq = ctx->sequence;
-
-	/*
-	 * Now transfer enough transaction reservation to the context ticket
-	 * for the checkpoint. The context ticket is special - the unit
-	 * reservation has to grow as well as the current reservation as we
-	 * steal from tickets so we can correctly determine the space used
-	 * during the transaction commit.
-	 */
-	if (ctx->ticket->t_curr_res == 0) {
-		/* first commit in checkpoint, steal the header reservation */
-		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
-		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-		ticket->t_curr_res -= ctx->ticket->t_unit_res;
-	}
-
-	/* do we need space for more log record headers? */
-	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-	if (len > 0 && (ctx->space_used / iclog_space !=
-				(ctx->space_used + len) / iclog_space)) {
-		int hdrs;
-
-		hdrs = (len + iclog_space - 1) / iclog_space;
-		/* need to take into account split region headers, too */
-		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-		ctx->ticket->t_unit_res += hdrs;
-		ctx->ticket->t_curr_res += hdrs;
-		ticket->t_curr_res -= hdrs;
-		ASSERT(ticket->t_curr_res >= len);
-	}
-	ticket->t_curr_res -= len;
-	ctx->space_used += len;
-
-	spin_unlock(&cil->xc_cil_lock);
-}
-
 /*
  * Format log item into a flat buffers
  *
@@ -286,7 +190,7 @@ xlog_cil_format_items(
 			len += lv->lv_iovecp[index].i_len;
 
 		lv->lv_buf_len = len;
-		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
 		ptr = lv->lv_buf;
 
 		for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
 	}
 }
 
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+	struct log		*log,
+	struct xfs_log_vec	*lv,
+	int			*len,
+	int			*diff_iovecs)
+{
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&lv->lv_item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		*len += lv->lv_buf_len - old->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&lv->lv_item->li_cil));
+
+		*len += lv->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	/*
+	 * If this is the first time the item is being committed to the
+	 * CIL, store the sequence number on the log item so we can
+	 * tell in future commits whether this is the first checkpoint
+	 * the item is being committed into.
+	 */
+	if (!lv->lv_item->li_seq)
+		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
 static void
 xlog_cil_insert_items(
 	struct log		*log,
 	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*ticket,
-	xfs_lsn_t		*start_lsn)
+	struct xlog_ticket	*ticket)
 {
-	struct xfs_log_vec *lv;
-
-	if (start_lsn)
-		*start_lsn = log->l_cilp->xc_ctx->sequence;
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	struct xfs_log_vec	*lv;
+	int			len = 0;
+	int			diff_iovecs = 0;
+	int			iclog_space;
 
 	ASSERT(log_vector);
+
+	/*
+	 * Do all the accounting aggregation and switching of log vectors
+	 * around in a separate loop to the insertion of items into the CIL.
+	 * Then we can do a separate loop to update the CIL within a single
+	 * lock/unlock pair. This reduces the number of round trips on the CIL
+	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+	 * hold time for the transaction commit.
+	 *
+	 * If this is the first time the item is being placed into the CIL in
+	 * this context, pin it so it can't be written to disk until the CIL is
+	 * flushed to the iclog and the iclog written to disk.
+	 *
+	 * We can do this safely because the context can't checkpoint until we
+	 * are done so it doesn't matter exactly how we update the CIL.
+	 */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	spin_lock(&cil->xc_cil_lock);
+
+	/* move the items to the tail of the CIL */
 	for (lv = log_vector; lv; lv = lv->lv_next)
-		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
 }
 
 static void
@@ -638,7 +657,10 @@ xfs_log_commit_cil(
 
 	/* lock out background commit */
 	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+	if (commit_lsn)
+		*commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+	xlog_cil_insert_items(log, log_vector, tp->t_ticket);
 
 	/* check we didn't blow the reservation */
 	if (tp->t_ticket->t_curr_res < 0)
-- 
cgit v1.2.3


From bd32d25a7cf7242512e77e70bab63df4402ab91c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: remove debug assert for per-ag reference counting

When we start taking references per cached buffer to the the perag
it is cached on, it will blow the current debug maximum reference
count assert out of the water. The assert has never caught a bug,
and we have tracing to track changes if there ever is a problem,
so just remove it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..00c7a876807d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
 	if (pag) {
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
-		/* catch leaks in the positive direction during testing */
-		ASSERT(atomic_read(&pag->pag_ref) < 1000);
 		ref = atomic_inc_return(&pag->pag_ref);
 	}
 	spin_unlock(&mp->m_perag_lock);
-- 
cgit v1.2.3


From e176579e70118ed7cfdb60f963628fe0ca771f3d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: lockless per-ag lookups

When we start taking a reference to the per-ag for every cached
buffer in the system, kernel lockstat profiling on an 8-way create
workload shows the mp->m_perag_lock has higher acquisition rates
than the inode lock and has significantly more contention. That is,
it becomes the highest contended lock in the system.

The perag lookup is trivial to convert to lock-less RCU lookups
because perag structures never go away. Hence the only thing we need
to protect against is tree structure changes during a grow. This can
be done simply by replacing the locking in xfs_perag_get() with RCU
read locking. This removes the mp->m_perag_lock completely from this
path.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c |  6 +++---
 fs/xfs/xfs_ag.h             |  3 +++
 fs/xfs/xfs_mount.c          | 25 +++++++++++++++++--------
 3 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..3a1d229b4784 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -150,17 +150,17 @@ xfs_inode_ag_iter_next_pag(
 		int found;
 		int ref;
 
-		spin_lock(&mp->m_perag_lock);
+		rcu_read_lock();
 		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
 				(void **)&pag, *first, 1, tag);
 		if (found <= 0) {
-			spin_unlock(&mp->m_perag_lock);
+			rcu_read_unlock();
 			return NULL;
 		}
 		*first = pag->pag_agno + 1;
 		/* open coded pag reference increment */
 		ref = atomic_inc_return(&pag->pag_ref);
-		spin_unlock(&mp->m_perag_lock);
+		rcu_read_unlock();
 		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
 	} else {
 		pag = xfs_perag_get(mp, *first);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..51c42c202bf1 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,9 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+
+	/* for rcu-safe freeing */
+	struct rcu_head	rcu_head;
 #endif
 	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00c7a876807d..14fc6e9e1816 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -199,6 +199,8 @@ xfs_uuid_unmount(
 
 /*
  * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
  */
 struct xfs_perag *
 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,13 +208,13 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 	struct xfs_perag	*pag;
 	int			ref = 0;
 
-	spin_lock(&mp->m_perag_lock);
+	rcu_read_lock();
 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
 	if (pag) {
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
 		ref = atomic_inc_return(&pag->pag_ref);
 	}
-	spin_unlock(&mp->m_perag_lock);
+	rcu_read_unlock();
 	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
 	return pag;
 }
@@ -227,10 +229,18 @@ xfs_perag_put(struct xfs_perag *pag)
 	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
 }
 
+STATIC void
+__xfs_free_perag(
+	struct rcu_head	*head)
+{
+	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+	ASSERT(atomic_read(&pag->pag_ref) == 0);
+	kmem_free(pag);
+}
+
 /*
- * Free up the resources associated with a mount structure.  Assume that
- * the structure was initially zeroed, so we can tell which fields got
- * initialized.
+ * Free up the per-ag resources associated with the mount structure.
  */
 STATIC void
 xfs_free_perag(
@@ -242,10 +252,9 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
-		ASSERT(pag);
-		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
-		kmem_free(pag);
+		ASSERT(pag);
+		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
 
-- 
cgit v1.2.3


From dcd79a1423f64ee0184629874805c3ac40f3a2c5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 28 Sep 2010 12:27:25 +1000
Subject: xfs: don't use vfs writeback for pure metadata modifications

Under heavy multi-way parallel create workloads, the VFS struggles
to write back all the inodes that have been changed in age order.
The bdi flusher thread becomes CPU bound, spending 85% of it's time
in the VFS code, mostly traversing the superblock dirty inode list
to separate dirty inodes old enough to flush.

We already keep an index of all metadata changes in age order - in
the AIL - and continued log pressure will do age ordered writeback
without any extra overhead at all. If there is no pressure on the
log, the xfssyncd will periodically write back metadata in ascending
disk address offset order so will be very efficient.

Hence we can stop marking VFS inodes dirty during transaction commit
or when changing timestamps during transactions. This will keep the
inodes in the superblock dirty list to those containing data or
unlogged metadata changes.

However, the timstamp changes are slightly more complex than this -
there are a couple of places that do unlogged updates of the
timestamps, and the VFS need to be informed of these. Hence add a
new function xfs_trans_ichgtime() for transactional changes,
and leave xfs_ichgtime() for the non-transactional changes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   |  2 +-
 fs/xfs/linux-2.6/xfs_iops.c    | 35 -----------------------------------
 fs/xfs/linux-2.6/xfs_super.c   |  7 +------
 fs/xfs/quota/xfs_qm_syscalls.c |  2 +-
 fs/xfs/xfs_attr.c              | 31 +++++++++++--------------------
 fs/xfs/xfs_inode.h             |  1 -
 fs/xfs/xfs_inode_item.c        |  9 ---------
 fs/xfs/xfs_rename.c            | 12 ++++++++----
 fs/xfs/xfs_trans.h             |  1 +
 fs/xfs/xfs_trans_inode.c       | 30 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_utils.c             |  4 ++--
 fs/xfs/xfs_vnodeops.c          | 17 ++++++++++-------
 12 files changed, 65 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 03aa908a9cb9..10206be7a077 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1088,8 +1088,8 @@ xfs_ioctl_setattr(
 		xfs_diflags_to_linux(ip);
 	}
 
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
 
 	XFS_STATS_INC(xs_ig_attrchg);
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..a788f016d1fa 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -94,41 +94,6 @@ xfs_mark_inode_dirty(
 		mark_inode_dirty(inode);
 }
 
-/*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- */
-void
-xfs_ichgtime(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	struct inode	*inode = VFS_I(ip);
-	timespec_t	tv;
-	int		sync_it = 0;
-
-	tv = current_fs_time(inode->i_sb);
-
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
-		inode->i_mtime = tv;
-		sync_it = 1;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
-		inode->i_ctime = tv;
-		sync_it = 1;
-	}
-
-	/*
-	 * Update complete - now make sure everyone knows that the inode
-	 * is dirty.
-	 */
-	if (sync_it)
-		xfs_mark_inode_dirty_sync(ip);
-}
-
 /*
  * Hook in SELinux.  This is not quite correct yet, what we really need
  * here (as we do for default ACLs) is a mechanism by which creation of
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..83154c0a3175 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -972,12 +972,7 @@ xfs_fs_inode_init_once(
 
 /*
  * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode. Care must be taken
- * here - the transaction code calls mark_inode_dirty_sync() to mark the
- * VFS inode dirty in a transaction and clears the i_update_core field;
- * it must clear the field after calling mark_inode_dirty_sync() to
- * correctly indicate that the dirty state has been propagated into the
- * inode log item.
+ * we catch unlogged VFS level updates to the inode.
  *
  * We need the barrier() to maintain correct ordering between unlogged
  * updates and the transaction commit code that clears the i_update_core
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..7a71336f7922 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
 		goto out_unlock;
 	}
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
 out_unlock:
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..905d390c1e5c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
 			if (mp->m_flags & XFS_MOUNT_WSYNC) {
 				xfs_trans_set_sync(args.trans);
 			}
+
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_trans_ichgtime(args.trans, dp,
+							XFS_ICHGTIME_CHG);
+			}
 			err2 = xfs_trans_commit(args.trans,
 						 XFS_TRANS_RELEASE_LOG_RES);
 			xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-			/*
-			 * Hit the inode change time.
-			 */
-			if (!error && (flags & ATTR_KERNOTIME) == 0) {
-				xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-			}
 			return(error == 0 ? err2 : error);
 		}
 
@@ -420,6 +419,9 @@ xfs_attr_set_int(
 		xfs_trans_set_sync(args.trans);
 	}
 
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
 	/*
 	 * Commit the last in the sequence of transactions.
 	 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
 	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Hit the inode change time.
-	 */
-	if (!error && (flags & ATTR_KERNOTIME) == 0) {
-		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-	}
-
 	return(error);
 
 out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 		xfs_trans_set_sync(args.trans);
 	}
 
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
 	/*
 	 * Commit the last in the sequence of transactions.
 	 */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Hit the inode change time.
-	 */
-	if (!error && (flags & ATTR_KERNOTIME) == 0) {
-		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-	}
-
 	return(error);
 
 out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..54781fcd322a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -471,7 +471,6 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
-void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..c7ac020705df 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -222,15 +222,6 @@ xfs_inode_item_format(
 	vecp++;
 	nvecs	     = 1;
 
-	/*
-	 * Make sure the linux inode is dirty. We do this before
-	 * clearing i_update_core as the VFS will call back into
-	 * XFS here and set i_update_core, so we need to dirty the
-	 * inode first so that the ordering of i_update_core and
-	 * unlogged modifications still works as described below.
-	 */
-	xfs_mark_inode_dirty_sync(ip);
-
 	/*
 	 * Clear i_update_core if the timestamps (or any other
 	 * non-transactional modification) need flushing/logging
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..9028733f7ed8 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -211,7 +211,9 @@ xfs_rename(
 			goto error_return;
 		if (error)
 			goto abort_return;
-		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		if (new_parent && src_is_directory) {
 			error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
 					&first_block, &free_list, spaceres);
 		if (error)
 			goto abort_return;
-		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		/*
 		 * Decrement the link count on the target since the target
@@ -292,7 +296,7 @@ xfs_rename(
 	 * inode isn't really being changed, but old unix file systems did
 	 * it and some incremental backup programs won't work without it.
 	 */
-	xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
 
 	/*
 	 * Adjust the link count on src_dp.  This is necessary when
@@ -315,7 +319,7 @@ xfs_rename(
 	if (error)
 		goto abort_return;
 
-	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
 	if (new_parent)
 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..e2cbe4da7d8e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -473,6 +473,7 @@ void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 int		xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
 			       xfs_ino_t , uint, uint, struct xfs_inode **);
+void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
 void		xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
 void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
 void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -117,6 +117,36 @@ xfs_trans_ijoin_ref(
 	ip->i_itemp->ili_lock_flags = lock_flags;
 }
 
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	timespec_t		tv;
+
+	ASSERT(tp);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(ip->i_transp == tp);
+
+	tv = current_fs_time(inode->i_sb);
+
+	if ((flags & XFS_ICHGTIME_MOD) &&
+	    !timespec_equal(&inode->i_mtime, &tv)) {
+		inode->i_mtime = tv;
+	}
+	if ((flags & XFS_ICHGTIME_CHG) &&
+	    !timespec_equal(&inode->i_ctime, &tv)) {
+		inode->i_ctime = tv;
+	}
+}
+
 /*
  * This is called to mark the fields indicated in fieldmask as needing
  * to be logged when the transaction is committed.  The inode must
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..4c2ba6f6adc8 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -235,7 +235,7 @@ xfs_droplink(
 {
 	int	error;
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
@@ -299,7 +299,7 @@ xfs_bumplink(
 {
 	if (ip->i_d.di_nlink >= XFS_MAXLINK)
 		return XFS_ERROR(EMLINK);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dc6e4fb8bbc9..a230cd4d0774 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -184,8 +184,11 @@ xfs_setattr(
 		    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			if (mask & ATTR_CTIME)
-				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			if (mask & ATTR_CTIME) {
+				inode->i_mtime = inode->i_ctime =
+						current_fs_time(inode->i_sb);
+				xfs_mark_inode_dirty_sync(ip);
+			}
 			code = 0;
 			goto error_return;
 		}
@@ -1391,7 +1394,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1742,7 +1745,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1895,7 +1898,7 @@ xfs_link(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -2129,7 +2132,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2833,7 +2836,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-- 
cgit v1.2.3


From 686865f76e35b28ba7aa6afa19209426f0da6201 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 20:07:47 +1000
Subject: xfs: rename xfs_buf_get_nodaddr to be more appropriate

xfs_buf_get_nodaddr() is really used to allocate a buffer that is
uncached. While it is not directly assigned a disk address, the fact
that they are not cached is a more important distinction. With the
upcoming uncached buffer read primitive, we should be consistent
with this disctinction.

While there, make page allocation in xfs_buf_get_nodaddr() safe
against memory reclaim re-entrancy into the filesystem by allowing
a flags parameter to be passed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   | 9 +++++----
 fs/xfs/linux-2.6/xfs_buf.h   | 2 +-
 fs/xfs/linux-2.6/xfs_trace.h | 2 +-
 fs/xfs/xfs_log.c             | 3 ++-
 fs/xfs/xfs_log_recover.c     | 3 ++-
 fs/xfs/xfs_vnodeops.c        | 6 +++---
 6 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..eca945b0f88f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -707,9 +707,10 @@ xfs_buf_associate_memory(
 }
 
 xfs_buf_t *
-xfs_buf_get_noaddr(
+xfs_buf_get_uncached(
+	struct xfs_buftarg	*target,
 	size_t			len,
-	xfs_buftarg_t		*target)
+	int			flags)
 {
 	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
 	int			error, i;
@@ -725,7 +726,7 @@ xfs_buf_get_noaddr(
 		goto fail_free_buf;
 
 	for (i = 0; i < page_count; i++) {
-		bp->b_pages[i] = alloc_page(GFP_KERNEL);
+		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
 		if (!bp->b_pages[i])
 			goto fail_free_mem;
 	}
@@ -740,7 +741,7 @@ xfs_buf_get_noaddr(
 
 	xfs_buf_unlock(bp);
 
-	trace_xfs_buf_get_noaddr(bp, _RET_IP_);
+	trace_xfs_buf_get_uncached(bp, _RET_IP_);
 	return bp;
 
  fail_free_mem:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..fb30447091d8 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -213,7 +213,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
 
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
+extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..2a1d4fbd9ed8 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -331,7 +331,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..c8a309424307 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1131,7 +1131,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
-		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+						log->l_iclog_size, 0);
 		if (!bp)
 			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..3d887542b037 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -107,7 +107,8 @@ xlog_get_bp(
 		nbblks += log->l_sectBBsize;
 	nbblks = round_up(nbblks, log->l_sectBBsize);
 
-	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
+	return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
+					BBTOB(nbblks), 0);
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a230cd4d0774..b7bdc43308e4 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2434,9 +2434,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
-- 
cgit v1.2.3


From 5adc94c247c3779782c7b0b8b5e28cf50596eb37 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 21:58:31 +1000
Subject: xfs: introduced uncached buffer read primitve

To avoid the need to use cached buffers for single-shot or buffers
cached at the filesystem level, introduce a new buffer read
primitive that bypasses the cache an reads directly from disk.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 34 ++++++++++++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_buf.h |  3 +++
 2 files changed, 37 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index eca945b0f88f..22c7bff77ad2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -638,6 +638,40 @@ xfs_buf_readahead(
 	xfs_buf_read(target, ioff, isize, flags);
 }
 
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		daddr,
+	size_t			length,
+	int			flags)
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	bp = xfs_buf_get_uncached(target, length, flags);
+	if (!bp)
+		return NULL;
+
+	/* set up the buffer for a read IO */
+	xfs_buf_lock(bp);
+	XFS_BUF_SET_ADDR(bp, daddr);
+	XFS_BUF_READ(bp);
+	XFS_BUF_BUSY(bp);
+
+	xfsbdstrat(mp, bp);
+	error = xfs_iowait(bp);
+	if (error || bp->b_error) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	return bp;
+}
+
 xfs_buf_t *
 xfs_buf_get_empty(
 	size_t			len,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fb30447091d8..57eedc750ee6 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -218,6 +218,9 @@ extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+				struct xfs_buftarg *target,
+				xfs_daddr_t daddr, size_t length, int flags);
 
 /* Releasing Buffers */
 extern void xfs_buf_free(xfs_buf_t *);
-- 
cgit v1.2.3


From ebad861b5702c3e2332a3e906978f47144d22f70 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: store xfs_mount in the buftarg instead of in the xfs_buf

Each buffer contains both a buftarg pointer and a mount pointer. If
we add a mount pointer into the buftarg, we can avoid needing the
b_mount field in every buffer and grab it from the buftarg when
needed instead. This shrinks the xfs_buf by 8 bytes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   |  9 ++++-----
 fs/xfs/linux-2.6/xfs_buf.h   |  5 +++--
 fs/xfs/linux-2.6/xfs_super.c |  8 +++++---
 fs/xfs/xfs_buf_item.c        |  3 +--
 fs/xfs/xfs_log_recover.c     | 16 +++++++---------
 5 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 22c7bff77ad2..d6928970097f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -894,7 +894,7 @@ xfs_buf_lock(
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-		xfs_log_force(bp->b_mount, 0);
+		xfs_log_force(bp->b_target->bt_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
@@ -1017,7 +1017,6 @@ xfs_bwrite(
 {
 	int			error;
 
-	bp->b_mount = mp;
 	bp->b_flags |= XBF_WRITE;
 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
 
@@ -1038,8 +1037,6 @@ xfs_bdwrite(
 {
 	trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
-	bp->b_mount = mp;
-
 	bp->b_flags &= ~XBF_READ;
 	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
 
@@ -1128,7 +1125,7 @@ int
 xfs_bdstrat_cb(
 	struct xfs_buf	*bp)
 {
-	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 		trace_xfs_bdstrat_shut(bp, _RET_IP_);
 		/*
 		 * Metadata write that didn't get logged but
@@ -1644,6 +1641,7 @@ out_error:
 
 xfs_buftarg_t *
 xfs_alloc_buftarg(
+	struct xfs_mount	*mp,
 	struct block_device	*bdev,
 	int			external,
 	const char		*fsname)
@@ -1652,6 +1650,7 @@ xfs_alloc_buftarg(
 
 	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
 
+	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
 	if (xfs_setsize_buftarg_early(btp, bdev))
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 57eedc750ee6..def2cea2b4d6 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -132,6 +132,7 @@ typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
 	struct address_space	*bt_mapping;
+	struct xfs_mount	*bt_mount;
 	unsigned int		bt_bsize;
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
@@ -189,7 +190,6 @@ typedef struct xfs_buf {
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
-	struct xfs_mount	*b_mount;
 	unsigned short		b_error;	/* error code on I/O */
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset in first page */
@@ -377,7 +377,8 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 /*
  *	Handling of buftargs.
  */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+			struct block_device *, int, const char *);
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 83154c0a3175..4759be4daa26 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -758,18 +758,20 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+							mp->m_fsname);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+							mp->m_fsname);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..ee7557611b6e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -692,8 +692,7 @@ xfs_buf_item_init(
 	 * the first.  If we do already have one, there is
 	 * nothing to do here so return.
 	 */
-	if (bp->b_mount != mp)
-		bp->b_mount = mp;
+	ASSERT(bp->b_target->bt_mount == mp);
 	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 		if (lip->li_type == XFS_LI_BUF) {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3d887542b037..351d71117f16 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -322,10 +322,11 @@ xlog_recover_iodone(
 		 * this during recovery. One strike!
 		 */
 		xfs_ioerror_alert("xlog_recover_iodone",
-				  bp->b_mount, bp, XFS_BUF_ADDR(bp));
-		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
+					bp->b_target->bt_mount, bp,
+					XFS_BUF_ADDR(bp));
+		xfs_force_shutdown(bp->b_target->bt_mount,
+					SHUTDOWN_META_IO_ERROR);
 	}
-	bp->b_mount = NULL;
 	XFS_BUF_CLR_IODONE_FUNC(bp);
 	xfs_biodone(bp);
 }
@@ -2276,8 +2277,7 @@ xlog_recover_do_buffer_trans(
 		XFS_BUF_STALE(bp);
 		error = xfs_bwrite(mp, bp);
 	} else {
-		ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-		bp->b_mount = mp;
+		ASSERT(bp->b_target->bt_mount == mp);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 		xfs_bdwrite(mp, bp);
 	}
@@ -2541,8 +2541,7 @@ xlog_recover_do_inode_trans(
 	}
 
 write_inode_buffer:
-	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-	bp->b_mount = mp;
+	ASSERT(bp->b_target->bt_mount == mp);
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 	xfs_bdwrite(mp, bp);
 error:
@@ -2679,8 +2678,7 @@ xlog_recover_do_dquot_trans(
 	memcpy(ddq, recddq, item->ri_buf[1].i_len);
 
 	ASSERT(dq_f->qlf_size == 2);
-	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-	bp->b_mount = mp;
+	ASSERT(bp->b_target->bt_mount == mp);
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 	xfs_bdwrite(mp, bp);
 
-- 
cgit v1.2.3


From 26af655233dd486659235f3049959d2f7dafc5a1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: kill XBF_FS_MANAGED buffers

Filesystem level managed buffers are buffers that have their
lifecycle controlled by the filesystem layer, not the buffer cache.
We currently cache these buffers, which makes cleanup and cache
walking somewhat troublesome. Convert the fs managed buffers to
uncached buffers obtained by via xfs_buf_get_uncached(), and remove
the XBF_FS_MANAGED special cases from the buffer cache.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 20 ++++-------------
 fs/xfs/linux-2.6/xfs_buf.h |  4 ----
 fs/xfs/xfs_mount.c         | 55 ++++++++++++++--------------------------------
 3 files changed, 20 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d6928970097f..975d6589394a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -826,8 +826,6 @@ xfs_buf_rele(
 			atomic_inc(&bp->b_hold);
 			spin_unlock(&hash->bh_lock);
 			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
 			list_del_init(&bp->b_hash_list);
@@ -1433,26 +1431,16 @@ void
 xfs_wait_buftarg(
 	xfs_buftarg_t	*btp)
 {
-	xfs_buf_t	*bp, *n;
 	xfs_bufhash_t	*hash;
 	uint		i;
 
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		hash = &btp->bt_hash[i];
-again:
 		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
+		while (!list_empty(&hash->bh_list)) {
+			spin_unlock(&hash->bh_lock);
+			delay(100);
+			spin_lock(&hash->bh_lock);
 		}
 		spin_unlock(&hash->bh_lock);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index def2cea2b4d6..1f109cee136c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
 #define XBF_DONE	(1 << 5) /* all pages in the buffer uptodate */
 #define XBF_DELWRI	(1 << 6) /* buffer has dirty pages */
 #define XBF_STALE	(1 << 7) /* buffer has been staled, do not find it */
-#define XBF_FS_MANAGED	(1 << 8) /* filesystem controls freeing memory */
 #define XBF_ORDERED	(1 << 11)/* use ordered writes */
 #define XBF_READ_AHEAD	(1 << 12)/* asynchronous read-ahead */
 #define XBF_LOG_BUFFER	(1 << 13)/* this is a buffer used for the log */
@@ -104,7 +103,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_DELWRI,		"DELWRI" }, \
 	{ XBF_STALE,		"STALE" }, \
-	{ XBF_FS_MANAGED,	"FS_MANAGED" }, \
 	{ XBF_ORDERED,		"ORDERED" }, \
 	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
 	{ XBF_LOCK,		"LOCK" },  	/* should never be set */\
@@ -279,8 +277,6 @@ extern void xfs_buf_terminate(void);
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
-
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
 #define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
 #define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 14fc6e9e1816..fbca293326e5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -646,7 +646,6 @@ int
 xfs_readsb(xfs_mount_t *mp, int flags)
 {
 	unsigned int	sector_size;
-	unsigned int	extra_flags;
 	xfs_buf_t	*bp;
 	int		error;
 
@@ -659,28 +658,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
-	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
-			  extra_flags);
-	if (!bp || XFS_BUF_ISERROR(bp)) {
-		xfs_fs_mount_cmn_err(flags, "SB read failed");
-		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-		goto fail;
+reread:
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					XFS_SB_DADDR, sector_size, 0);
+	if (!bp) {
+		xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
+		return EIO;
 	}
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 
 	/*
 	 * Initialize the mount structure from the superblock.
 	 * But first do some basic consistency checking.
 	 */
 	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
-
 	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
 	if (error) {
 		xfs_fs_mount_cmn_err(flags, "SB validate failed");
-		goto fail;
+		goto release_buf;
 	}
 
 	/*
@@ -691,7 +686,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 			"device supports only %u byte sectors (not %u)",
 			sector_size, mp->m_sb.sb_sectsize);
 		error = ENOSYS;
-		goto fail;
+		goto release_buf;
 	}
 
 	/*
@@ -699,33 +694,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * re-read the superblock so the buffer is correctly sized.
 	 */
 	if (sector_size < mp->m_sb.sb_sectsize) {
-		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
-				  BTOBB(sector_size), extra_flags);
-		if (!bp || XFS_BUF_ISERROR(bp)) {
-			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
-			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-			goto fail;
-		}
-		ASSERT(XFS_BUF_ISBUSY(bp));
-		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		goto reread;
 	}
 
 	/* Initialize per-cpu counters */
 	xfs_icsb_reinit_counters(mp);
 
 	mp->m_sb_bp = bp;
-	xfs_buf_relse(bp);
-	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
+	xfs_buf_unlock(bp);
 	return 0;
 
- fail:
-	if (bp) {
-		XFS_BUF_UNMANAGE(bp);
-		xfs_buf_relse(bp);
-	}
+release_buf:
+	xfs_buf_relse(bp);
 	return error;
 }
 
@@ -2005,18 +1987,13 @@ xfs_getsb(
  */
 void
 xfs_freesb(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf		*bp = mp->m_sb_bp;
 
-	/*
-	 * Use xfs_getsb() so that the buffer will be locked
-	 * when we call xfs_buf_relse().
-	 */
-	bp = xfs_getsb(mp, 0);
-	XFS_BUF_UNMANAGE(bp);
-	xfs_buf_relse(bp);
+	xfs_buf_lock(bp);
 	mp->m_sb_bp = NULL;
+	xfs_buf_relse(bp);
 }
 
 /*
-- 
cgit v1.2.3


From 1922c949c59f93beb560d59874bcc6d5c00115ac Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: use unhashed buffers for size checks

When we are checking we can access the last block of each device, we
do not need to use cached buffers as they will be tossed away
immediately. Use uncached buffers for size checks so that all IO
prior to full in-memory structure initialisation does not use the
buffer cache.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_fsops.c   | 11 +++++------
 fs/xfs/xfs_mount.c   | 39 ++++++++++++++++-----------------------
 fs/xfs/xfs_rtalloc.c | 29 +++++++++++++----------------
 3 files changed, 34 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..6a1edb1348f6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fbca293326e5..912101d280bf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -980,42 +980,35 @@ xfs_check_sizes(xfs_mount_t *mp)
 {
 	xfs_buf_t	*bp;
 	xfs_daddr_t	d;
-	int		error;
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
-		cmn_err(CE_WARN, "XFS: size check 1 failed");
+		cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			     d - XFS_FSS_TO_BB(mp, 1),
-			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (!error) {
-		xfs_buf_relse(bp);
-	} else {
-		cmn_err(CE_WARN, "XFS: size check 2 failed");
-		if (error == ENOSPC)
-			error = XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					d - XFS_FSS_TO_BB(mp, 1),
+					BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: last sector read failed");
+		return EIO;
 	}
+	xfs_buf_relse(bp);
 
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			cmn_err(CE_WARN, "XFS: log size mismatch detected");
 			return XFS_ERROR(EFBIG);
 		}
-		error = xfs_read_buf(mp, mp->m_logdev_targp,
-				     d - XFS_FSB_TO_BB(mp, 1),
-				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
-		if (!error) {
-			xfs_buf_relse(bp);
-		} else {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
-			if (error == ENOSPC)
-				error = XFS_ERROR(EFBIG);
-			return error;
+		bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1), 0);
+		if (!bp) {
+			cmn_err(CE_WARN, "XFS: log device read failed");
+			return EIO;
 		}
+		xfs_buf_relse(bp);
 	}
 	return 0;
 }
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_buf.h"
 
 
 /*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-			XFS_FSB_TO_BB(mp, nrblocks - 1),
-			XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+				XFS_FSB_TO_BB(mp, nrblocks - 1),
+				XFS_FSB_TO_B(mp, 1), 0);
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
+
 	/*
 	 * Calculate new parameters.  These are the final values to be reached.
 	 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
 {
 	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
 	xfs_daddr_t	d;	/* address of last block of subvolume */
-	int		error;	/* error return value */
 	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
 
 	sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-				d - XFS_FSB_TO_BB(mp, 1),
-				XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error) {
-		cmn_err(CE_WARN,
-	"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
-		if (error == ENOSPC)
-			return XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1), 0);
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: realtime device size check failed");
+		return EIO;
 	}
 	xfs_buf_relse(bp);
 	return 0;
-- 
cgit v1.2.3


From 69d6cc76cff3573ceefda178b75e20878866fdc3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 22 Sep 2010 10:47:20 +1000
Subject: xfs: remove buftarg hash for external devices

For RT and external log devices, we never use hashed buffers on them
now.  Remove the buftarg hash tables that are set up for them.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 975d6589394a..251bcdc6352e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1458,7 +1458,11 @@ xfs_alloc_bufhash(
 {
 	unsigned int		i;
 
-	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */
+	if (external) {
+		btp->bt_hash = NULL;
+		return;
+	}
+	btp->bt_hashshift = 12;	/* 4096 buckets */
 	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
 					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-- 
cgit v1.2.3


From 65d0f20533c503b50bd5e7e86434512af7761eea Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 18:40:15 +1000
Subject: xfs: split inode AG walking into separate code for reclaim

The reclaim walk requires different locking and has a slightly
different walk algorithm, so separate it out so that it can be
optimised separately.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c    | 202 +++++++++++++++++++----------------------
 fs/xfs/linux-2.6/xfs_sync.h    |   2 +-
 fs/xfs/linux-2.6/xfs_trace.h   |   2 +-
 fs/xfs/quota/xfs_qm_syscalls.c |   3 +-
 fs/xfs/xfs_mount.c             |  26 ++++++
 fs/xfs/xfs_mount.h             |   2 +
 6 files changed, 122 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3a1d229b4784..b5cdf0ef39ec 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -40,78 +40,46 @@
 #include <linux/freezer.h>
 
 
-STATIC xfs_inode_t *
-xfs_inode_ag_lookup(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	uint32_t		*first_index,
-	int			tag)
-{
-	int			nr_found;
-	struct xfs_inode	*ip;
-
-	/*
-	 * use a gang lookup to find the next inode in the tree
-	 * as the tree is sparse and a gang lookup walks to find
-	 * the number of objects requested.
-	 */
-	if (tag == XFS_ICI_NO_TAG) {
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-				(void **)&ip, *first_index, 1);
-	} else {
-		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
-				(void **)&ip, *first_index, 1, tag);
-	}
-	if (!nr_found)
-		return NULL;
-
-	/*
-	 * Update the index for the next lookup. Catch overflows
-	 * into the next AG range which can occur if we have inodes
-	 * in the last block of the AG and we are currently
-	 * pointing to the last inode.
-	 */
-	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		return NULL;
-	return ip;
-}
-
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
-	int			flags,
-	int			tag,
-	int			exclusive,
-	int			*nr_to_scan)
+	int			flags)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
+	int			done;
 
 restart:
+	done = 0;
 	skipped = 0;
 	first_index = 0;
 	do {
 		int		error = 0;
+		int		nr_found;
 		xfs_inode_t	*ip;
 
-		if (exclusive)
-			write_lock(&pag->pag_ici_lock);
-		else
-			read_lock(&pag->pag_ici_lock);
-		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip) {
-			if (exclusive)
-				write_unlock(&pag->pag_ici_lock);
-			else
-				read_unlock(&pag->pag_ici_lock);
+		read_lock(&pag->pag_ici_lock);
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+				(void **)&ip, first_index, 1);
+		if (!nr_found) {
+			read_unlock(&pag->pag_ici_lock);
 			break;
 		}
 
+		/*
+		 * Update the index for the next lookup. Catch overflows
+		 * into the next AG range which can occur if we have inodes
+		 * in the last block of the AG and we are currently
+		 * pointing to the last inode.
+		 */
+		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+			done = 1;
+
 		/* execute releases pag->pag_ici_lock */
 		error = execute(ip, pag, flags);
 		if (error == EAGAIN) {
@@ -125,7 +93,7 @@ restart:
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while ((*nr_to_scan)--);
+	} while (!done);
 
 	if (skipped) {
 		delay(1);
@@ -134,73 +102,29 @@ restart:
 	return last_error;
 }
 
-/*
- * Select the next per-ag structure to iterate during the walk. The reclaim
- * walk is optimised only to walk AGs with reclaimable inodes in them.
- */
-static struct xfs_perag *
-xfs_inode_ag_iter_next_pag(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		*first,
-	int			tag)
-{
-	struct xfs_perag	*pag = NULL;
-
-	if (tag == XFS_ICI_RECLAIM_TAG) {
-		int found;
-		int ref;
-
-		rcu_read_lock();
-		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-				(void **)&pag, *first, 1, tag);
-		if (found <= 0) {
-			rcu_read_unlock();
-			return NULL;
-		}
-		*first = pag->pag_agno + 1;
-		/* open coded pag reference increment */
-		ref = atomic_inc_return(&pag->pag_ref);
-		rcu_read_unlock();
-		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
-	} else {
-		pag = xfs_perag_get(mp, *first);
-		(*first)++;
-	}
-	return pag;
-}
-
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
-	int			flags,
-	int			tag,
-	int			exclusive,
-	int			*nr_to_scan)
+	int			flags)
 {
 	struct xfs_perag	*pag;
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
-	int			nr;
 
-	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
 	ag = 0;
-	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
-		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
-						exclusive, &nr);
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
-		if (nr <= 0)
-			break;
 	}
-	if (nr_to_scan)
-		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
@@ -318,8 +242,7 @@ xfs_sync_data(
 
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG, 0, NULL);
+	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -337,8 +260,7 @@ xfs_sync_attr(
 {
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
-	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG, 0, NULL);
+	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
 }
 
 STATIC int
@@ -868,13 +790,72 @@ reclaim:
 
 }
 
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+
+		ag = pag->pag_agno + 1;
+
+		do {
+			struct xfs_inode *ip;
+			int	nr_found;
+
+			write_lock(&pag->pag_ici_lock);
+			nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+					(void **)&ip, first_index, 1,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				write_unlock(&pag->pag_ici_lock);
+				break;
+			}
+
+			/*
+			 * Update the index for the next lookup. Catch overflows
+			 * into the next AG range which can occur if we have inodes
+			 * in the last block of the AG and we are currently
+			 * pointing to the last inode.
+			 */
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+
+			error = xfs_reclaim_inode(ip, pag, flags);
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
+
+		} while (!done && (*nr_to_scan)--);
+
+		xfs_perag_put(pag);
+	}
+	return XFS_ERROR(last_error);
+}
+
 int
 xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-					XFS_ICI_RECLAIM_TAG, 1, NULL);
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
 }
 
 /*
@@ -896,17 +877,16 @@ xfs_reclaim_inode_shrink(
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
-					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
-		/* if we don't exhaust the scan, don't bother coming back */
+		xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan);
+		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
        }
 
 	reclaimable = 0;
 	ag = 0;
-	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
-					XFS_ICI_RECLAIM_TAG))) {
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
 		reclaimable += pag->pag_ici_reclaimable;
 		xfs_perag_put(pag);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..e8a352896d20 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -50,7 +50,7 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag, int write_lock, int *nr_to_scan);
+	int flags);
 
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2a1d4fbd9ed8..286dc201c5b9 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name,	\
 		 unsigned long caller_ip),					\
 	TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 7a71336f7922..ac11fbef37fc 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -918,8 +918,7 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
-				XFS_ICI_NO_TAG, 0, NULL);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
 }
 
 /*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 912101d280bf..d66e87c7c3a6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -219,6 +219,32 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 	return pag;
 }
 
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		first,
+	int			tag)
+{
+	struct xfs_perag	*pag;
+	int			found;
+	int			ref;
+
+	rcu_read_lock();
+	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+					(void **)&pag, first, 1, tag);
+	if (found <= 0) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	ref = atomic_inc_return(&pag->pag_ref);
+	rcu_read_unlock();
+	trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+	return pag;
+}
+
 void
 xfs_perag_put(struct xfs_perag *pag)
 {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..7ab240930ba5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,6 +327,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
  * perag get/put wrappers for ref counting
  */
 struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+					int tag);
 void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
-- 
cgit v1.2.3


From e13de955ca67b0bd1cec9a2f9352a3053065bf7f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 28 Sep 2010 12:28:06 +1000
Subject: xfs: split out inode walk inode grabbing

When doing read side inode cache walks, the code to validate and
grab an inode is common to all callers. Split it out of the execute
callbacks in preparation for batching lookups. Similarly, split out
the inode reference dropping from the execute callbacks into the
main lookup look to be symmetric with the grab.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c    | 79 ++++++++++++++++++------------------------
 fs/xfs/quota/xfs_qm_syscalls.c |  9 -----
 2 files changed, 34 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b5cdf0ef39ec..37fc2c0a4d25 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,6 +39,33 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EFSCORRUPTED;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		return ENOENT;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return ENOENT;
+
+	if (is_bad_inode(inode)) {
+		IRELE(ip);
+		return ENOENT;
+	}
+
+	/* inode is valid */
+	return 0;
+}
+
 
 STATIC int
 xfs_inode_ag_walk(
@@ -80,8 +107,14 @@ restart:
 		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 			done = 1;
 
-		/* execute releases pag->pag_ici_lock */
+		if (xfs_inode_ag_walk_grab(ip)) {
+			read_unlock(&pag->pag_ici_lock);
+			continue;
+		}
+		read_unlock(&pag->pag_ici_lock);
+
 		error = execute(ip, pag, flags);
+		IRELE(ip);
 		if (error == EAGAIN) {
 			skipped++;
 			continue;
@@ -128,40 +161,6 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-/* must be called with pag_ici_lock held and releases it */
-int
-xfs_sync_inode_valid(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag)
-{
-	struct inode		*inode = VFS_I(ip);
-	int			error = EFSCORRUPTED;
-
-	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		goto out_unlock;
-
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	error = ENOENT;
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		goto out_unlock;
-
-	/* If we can't grab the inode, it must on it's way to reclaim. */
-	if (!igrab(inode))
-		goto out_unlock;
-
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		goto out_unlock;
-	}
-
-	/* inode is valid */
-	error = 0;
-out_unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return error;
-}
-
 STATIC int
 xfs_sync_inode_data(
 	struct xfs_inode	*ip,
@@ -172,10 +171,6 @@ xfs_sync_inode_data(
 	struct address_space *mapping = inode->i_mapping;
 	int			error = 0;
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		goto out_wait;
 
@@ -192,7 +187,6 @@ xfs_sync_inode_data(
  out_wait:
 	if (flags & SYNC_WAIT)
 		xfs_ioend_wait(ip);
-	IRELE(ip);
 	return error;
 }
 
@@ -204,10 +198,6 @@ xfs_sync_inode_attr(
 {
 	int			error = 0;
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_inode_clean(ip))
 		goto out_unlock;
@@ -226,7 +216,6 @@ xfs_sync_inode_attr(
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	IRELE(ip);
 	return error;
 }
 
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index ac11fbef37fc..57847434fa51 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
 	struct xfs_perag	*pag,
 	int			flags)
 {
-	int			error;
-
 	/* skip quota inodes */
 	if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
 	    ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
 		ASSERT(ip->i_udquot == NULL);
 		ASSERT(ip->i_gdquot == NULL);
-		read_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
 		xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
 		ip->i_gdquot = NULL;
 	}
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	IRELE(ip);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 78ae5256768b91f25ce7a4eb9f56d563e302cc10 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 28 Sep 2010 12:28:19 +1000
Subject: xfs: implement batched inode lookups for AG walking

With the reclaim code separated from the generic walking code, it is
simple to implement batched lookups for the generic walk code.
Separate out the inode validation from the execute operations and
modify the tree lookups to get a batch of inodes at a time.

Reclaim operations will be optimised separately.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 66 ++++++++++++++++++++++++++++++---------------
 fs/xfs/linux-2.6/xfs_sync.h |  2 +-
 2 files changed, 45 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37fc2c0a4d25..0ed3d0ae3c28 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,6 +39,14 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
 STATIC int
 xfs_inode_ag_walk_grab(
 	struct xfs_inode	*ip)
@@ -66,7 +74,6 @@ xfs_inode_ag_walk_grab(
 	return 0;
 }
 
-
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
@@ -79,54 +86,69 @@ xfs_inode_ag_walk(
 	int			last_error = 0;
 	int			skipped;
 	int			done;
+	int			nr_found;
 
 restart:
 	done = 0;
 	skipped = 0;
 	first_index = 0;
+	nr_found = 0;
 	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 		int		error = 0;
-		int		nr_found;
-		xfs_inode_t	*ip;
+		int		i;
 
 		read_lock(&pag->pag_ici_lock);
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-				(void **)&ip, first_index, 1);
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
 			read_unlock(&pag->pag_ici_lock);
 			break;
 		}
 
 		/*
-		 * Update the index for the next lookup. Catch overflows
-		 * into the next AG range which can occur if we have inodes
-		 * in the last block of the AG and we are currently
-		 * pointing to the last inode.
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
 		 */
-		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-			done = 1;
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
 
-		if (xfs_inode_ag_walk_grab(ip)) {
-			read_unlock(&pag->pag_ici_lock);
-			continue;
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch overflows
+			 * into the next AG range which can occur if we have inodes
+			 * in the last block of the AG and we are currently
+			 * pointing to the last inode.
+			 */
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
 		}
+
+		/* unlock now we've grabbed the inodes. */
 		read_unlock(&pag->pag_ici_lock);
 
-		error = execute(ip, pag, flags);
-		IRELE(ip);
-		if (error == EAGAIN) {
-			skipped++;
-			continue;
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], pag, flags);
+			IRELE(batch[i]);
+			if (error == EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
 		}
-		if (error)
-			last_error = error;
 
 		/* bail out if the filesystem is corrupted.  */
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while (!done);
+	} while (nr_found && !done);
 
 	if (skipped) {
 		delay(1);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index e8a352896d20..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,7 +47,7 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
 
-int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
 	int flags);
-- 
cgit v1.2.3


From e3a20c0b02e1704ab115dfa9d012caf0fbc45ed0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 19:51:50 +1000
Subject: xfs: batch inode reclaim lookup

Batch and optimise the per-ag inode lookup for reclaim to minimise
scanning overhead. This involves gang lookups on the radix trees to
get multiple inodes during each tree walk, and tighter validation of
what inodes can be reclaimed without blocking befor we take any
locks.

This is based on ideas suggested in a proof-of-concept patch
posted by Nick Piggin.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 110 +++++++++++++++++++++++++++++++-------------
 1 file changed, 77 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 0ed3d0ae3c28..754bc591a247 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -630,6 +630,43 @@ __xfs_inode_clear_reclaim_tag(
 	__xfs_inode_clear_reclaim(pag, ip);
 }
 
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+
+	/*
+	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * The first is a flush lock check, the second is a already in reclaim
+	 * check. Only do these checks if we are not going to block on locks.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+		return 1;
+	}
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
 /*
  * Inodes in different states need to be treated differently, and the return
  * value of xfs_iflush is not sufficient to get this right. The following table
@@ -688,23 +725,6 @@ xfs_reclaim_inode(
 {
 	int	error = 0;
 
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		return 0;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -822,16 +842,19 @@ xfs_reclaim_inodes_ag(
 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
 		unsigned long	first_index = 0;
 		int		done = 0;
+		int		nr_found = 0;
 
 		ag = pag->pag_agno + 1;
 
 		do {
-			struct xfs_inode *ip;
-			int	nr_found;
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
 
 			write_lock(&pag->pag_ici_lock);
-			nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
-					(void **)&ip, first_index, 1,
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
 					XFS_ICI_RECLAIM_TAG);
 			if (!nr_found) {
 				write_unlock(&pag->pag_ici_lock);
@@ -839,20 +862,41 @@ xfs_reclaim_inodes_ag(
 			}
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
 			 */
-			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-				done = 1;
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 */
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
 
-			error = xfs_reclaim_inode(ip, pag, flags);
-			if (error && last_error != EFSCORRUPTED)
-				last_error = error;
+			/* unlock now we've grabbed the inodes. */
+			write_unlock(&pag->pag_ici_lock);
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
 
-		} while (!done && (*nr_to_scan)--);
+		} while (nr_found && !done && *nr_to_scan > 0);
 
 		xfs_perag_put(pag);
 	}
@@ -888,7 +932,7 @@ xfs_reclaim_inode_shrink(
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
-- 
cgit v1.2.3


From 69b491c214d7fd4d4df972ae5377be99ca3753db Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 27 Sep 2010 11:09:51 +1000
Subject: xfs: serialise inode reclaim within an AG

Memory reclaim via shrinkers has a terrible habit of having N+M
concurrent shrinker executions (N = num CPUs, M = num kswapds) all
trying to shrink the same cache. When the cache they are all working
on is protected by a single spinlock, massive contention an
slowdowns occur.

Wrap the per-ag inode caches with a reclaim mutex to serialise
reclaim access to the AG. This will block concurrent reclaim in each
AG but still allow reclaim to scan multiple AGs concurrently. Allow
shrinkers to move on to the next AG if it can't get the lock, and if
we can't get any AG, then start blocking on locks.

To prevent reclaimers from continually scanning the same inodes in
each AG, add a cursor that tracks where the last reclaim got up to
and start from that point on the next reclaim. This should avoid
only ever scanning a small number of inodes at the satart of each AG
and not making progress. If we have a non-shrinker based reclaim
pass, ignore the cursor and reset it to zero once we are done.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 30 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_ag.h             |  2 ++
 fs/xfs/xfs_mount.c          |  1 +
 3 files changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 754bc591a247..37d33254981d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -837,8 +837,12 @@ xfs_reclaim_inodes_ag(
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
 
+restart:
 	ag = 0;
+	skipped = 0;
 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
 		unsigned long	first_index = 0;
 		int		done = 0;
@@ -846,6 +850,15 @@ xfs_reclaim_inodes_ag(
 
 		ag = pag->pag_agno + 1;
 
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
 		do {
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
@@ -898,8 +911,25 @@ xfs_reclaim_inodes_ag(
 
 		} while (nr_found && !done && *nr_to_scan > 0);
 
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
 		xfs_perag_put(pag);
 	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (trylock && skipped && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
 	return XFS_ERROR(last_error);
 }
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 51c42c202bf1..baeec83d01f9 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,8 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
+	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d66e87c7c3a6..59859c343e04 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -477,6 +477,7 @@ xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		rwlock_init(&pag->pag_ici_lock);
+		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 
 		if (radix_tree_preload(GFP_NOFS))
-- 
cgit v1.2.3


From 74f75a0cb7033918eb0fa4a50df25091ac75c16e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 19:59:04 +1000
Subject: xfs: convert buffer cache hash to rbtree

The buffer cache hash is showing typical hash scalability problems.
In large scale testing the number of cached items growing far larger
than the hash can efficiently handle. Hence we need to move to a
self-scaling cache indexing mechanism.

I have selected rbtrees for indexing becuse they can have O(log n)
search scalability, and insert and remove cost is not excessive,
even on large trees. Hence we should be able to cache large numbers
of buffers without incurring the excessive cache miss search
penalties that the hash is imposing on us.

To ensure we still have parallel access to the cache, we need
multiple trees. Rather than hashing the buffers by disk address to
select a tree, it seems more sensible to separate trees by typical
access patterns. Most operations use buffers from within a single AG
at a time, so rather than searching lots of different lists,
separate the buffer indexes out into per-AG rbtrees. This means that
searches during metadata operation have a much higher chance of
hitting cache resident nodes, and that updates of the tree are less
likely to disturb trees being accessed on other CPUs doing
independent operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 136 ++++++++++++++++++++++-----------------------
 fs/xfs/linux-2.6/xfs_buf.h |   8 +--
 fs/xfs/xfs_ag.h            |   4 ++
 fs/xfs/xfs_mount.c         |   2 +
 4 files changed, 74 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 251bcdc6352e..749d7d39d657 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
 	atomic_set(&bp->b_hold, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
+	RB_CLEAR_NODE(&bp->b_rbnode);
 	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
 	trace_xfs_buf_free(bp, _RET_IP_);
 
-	ASSERT(list_empty(&bp->b_hash_list));
-
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (range_base < bp->b_file_offset)
+			rbp = &(*rbp)->rb_left;
+		else if (range_base > bp->b_file_offset)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
 			goto found;
 		}
@@ -449,17 +472,21 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 				range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
 
 	/* Attempt to get the semaphore without sleeping,
 	 * if this does not work then we need to drop the
@@ -809,27 +836,30 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
+	if (!pag) {
 		ASSERT(!bp->b_relse);
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
+			spin_unlock(&pag->pag_buf_lock);
+			bp->b_relse(bp);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -1429,56 +1459,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
+	struct xfs_buftarg	*btp)
 {
-	xfs_bufhash_t	*hash;
-	uint		i;
+	struct xfs_perag	*pag;
+	uint			i;
 
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-		spin_lock(&hash->bh_lock);
-		while (!list_empty(&hash->bh_list)) {
-			spin_unlock(&hash->bh_lock);
+	for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+		pag = xfs_perag_get(btp->bt_mount, i);
+		spin_lock(&pag->pag_buf_lock);
+		while (rb_first(&pag->pag_buf_tree)) {
+			spin_unlock(&pag->pag_buf_lock);
 			delay(100);
-			spin_lock(&hash->bh_lock);
+			spin_lock(&pag->pag_buf_lock);
 		}
-		spin_unlock(&hash->bh_lock);
-	}
-}
-
-/*
- *	Allocate buffer hash table for a given target.
- *	For devices containing metadata (i.e. not the log/realtime devices)
- *	we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
-{
-	unsigned int		i;
-
-	if (external) {
-		btp->bt_hash = NULL;
-		return;
-	}
-	btp->bt_hashshift = 12;	/* 4096 buckets */
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
 }
 
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
-
 /*
  *	buftarg list for delwrite queue processing
  */
@@ -1511,7 +1509,6 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
-	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
 	/* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1648,6 @@ xfs_alloc_buftarg(
 		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
-	xfs_alloc_bufhash(btp, external);
 	return btp;
 
 error:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 1f109cee136c..c79882ecb513 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -135,10 +135,6 @@ typedef struct xfs_buftarg {
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
 
-	/* per device buffer hash table */
-	uint			bt_hashshift;
-	xfs_bufhash_t		*bt_hash;
-
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
 	struct list_head	bt_list;
@@ -172,8 +168,8 @@ typedef struct xfs_buf {
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
 	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index baeec83d01f9..63c7a1a6c022 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -233,6 +233,10 @@ typedef struct xfs_perag {
 	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
+	/* buffer cache index */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
+	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 #endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 59859c343e04..cfa2fb4e7f97 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -479,6 +479,8 @@ xfs_initialize_perag(
 		rwlock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		spin_lock_init(&pag->pag_buf_lock);
+		pag->pag_buf_tree = RB_ROOT;
 
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
-- 
cgit v1.2.3


From 50f59e8eed85ec4c79bc2454ed50c7886f6c5ebf Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 24 Sep 2010 19:59:15 +1000
Subject: xfs: pack xfs_buf structure more tightly

pahole reports the struct xfs_buf has quite a few holes in it, so
packing the structure better will reduce the size of it by 16 bytes.
Also, move all the fields used in cache lookups into the first
cacheline.

Before on x86_64:

        /* size: 320, cachelines: 5 */
	/* sum members: 298, holes: 6, sum holes: 22 */

After on x86_64:

        /* size: 304, cachelines: 5 */
	/* padding: 6 */
	/* last cacheline: 48 bytes */

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.h | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index c79882ecb513..161333785f69 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -162,33 +162,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 #define XB_PAGES	2
 
 typedef struct xfs_buf {
+	/*
+	 * first cacheline holds all the fields needed for an uncontended cache
+	 * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sits on the first cacheline,
+	 * which is the only bit that is touched if we hit the semaphore
+	 * fast-path on locking.
+	 */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	xfs_off_t		b_file_offset;	/* offset in file */
+	size_t			b_buffer_length;/* size of buffer in bytes */
+	atomic_t		b_hold;		/* reference count */
+	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
-	unsigned long		b_queuetime;	/* time buffer was queued */
-	atomic_t		b_pin_count;	/* pin count */
+
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
-	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct rb_node		b_rbnode;	/* rbtree node */
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
-	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
-	xfs_off_t		b_file_offset;	/* offset in file */
-	size_t			b_buffer_length;/* size of buffer in bytes */
 	size_t			b_count_desired;/* desired transfer size */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
-	unsigned short		b_error;	/* error code on I/O */
-	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset in first page */
 	struct page		**b_pages;	/* array of page pointers */
 	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	unsigned long		b_queuetime;	/* time buffer was queued */
+	atomic_t		b_pin_count;	/* pin count */
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	unsigned short		b_error;	/* error code on I/O */
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
 #endif
-- 
cgit v1.2.3


From 61ba35dea0593fbc8d062cab3e4c4c3da5ce7104 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 30 Sep 2010 02:25:54 +0000
Subject: xfs: remove XFS_MOUNT_NO_PERCPU_SB

Fail the mount if we can't allocate memory for the per-CPU counters.
This is consistent with how we handle everything else in the mount
path and makes the superblock counter modification a lot simpler.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  6 ++++--
 fs/xfs/xfs_mount.c           | 40 +++++++++++++++-------------------------
 fs/xfs/xfs_mount.h           |  2 --
 3 files changed, 19 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4759be4daa26..a316c7316702 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1518,8 +1518,9 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_fsname;
 
-	if (xfs_icsb_init_counters(mp))
-		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+	error = xfs_icsb_init_counters(mp);
+	if (error)
+		goto out_close_devices;
 
 	error = xfs_readsb(mp, flags);
 	if (error)
@@ -1580,6 +1581,7 @@ xfs_fs_fill_super(
 	xfs_freesb(mp);
  out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
+ out_close_devices:
 	xfs_close_devices(mp);
  out_free_fsname:
 	xfs_free_fsname(mp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index cfa2fb4e7f97..55de83585e00 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1856,12 +1856,8 @@ xfs_mod_incore_sb(
 	case XFS_SBS_ICOUNT:
 	case XFS_SBS_IFREE:
 	case XFS_SBS_FDBLOCKS:
-		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-			status = xfs_icsb_modify_counters(mp, field,
-							delta, rsvd);
-			break;
-		}
-		/* FALLTHROUGH */
+		status = xfs_icsb_modify_counters(mp, field, delta, rsvd);
+		break;
 #endif
 	default:
 		spin_lock(&mp->m_sb_lock);
@@ -1910,15 +1906,12 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
 		case XFS_SBS_ICOUNT:
 		case XFS_SBS_IFREE:
 		case XFS_SBS_FDBLOCKS:
-			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-				spin_unlock(&mp->m_sb_lock);
-				status = xfs_icsb_modify_counters(mp,
-							msbp->msb_field,
-							msbp->msb_delta, rsvd);
-				spin_lock(&mp->m_sb_lock);
-				break;
-			}
-			/* FALLTHROUGH */
+			spin_unlock(&mp->m_sb_lock);
+			status = xfs_icsb_modify_counters(mp,
+						msbp->msb_field,
+						msbp->msb_delta, rsvd);
+			spin_lock(&mp->m_sb_lock);
+			break;
 #endif
 		default:
 			status = xfs_mod_incore_sb_unlocked(mp,
@@ -1948,16 +1941,13 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
 			case XFS_SBS_ICOUNT:
 			case XFS_SBS_IFREE:
 			case XFS_SBS_FDBLOCKS:
-				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-					spin_unlock(&mp->m_sb_lock);
-					status = xfs_icsb_modify_counters(mp,
-							msbp->msb_field,
-							-(msbp->msb_delta),
-							rsvd);
-					spin_lock(&mp->m_sb_lock);
-					break;
-				}
-				/* FALLTHROUGH */
+				spin_unlock(&mp->m_sb_lock);
+				status = xfs_icsb_modify_counters(mp,
+						msbp->msb_field,
+						-(msbp->msb_delta),
+						rsvd);
+				spin_lock(&mp->m_sb_lock);
+				break;
 #endif
 			default:
 				status = xfs_mod_incore_sb_unlocked(mp,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7ab240930ba5..a9d366e7656a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -232,8 +232,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
 #define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
 						 * I/O size in stat() */
-#define XFS_MOUNT_NO_PERCPU_SB	(1ULL << 23)	/* don't use per-cpu superblock
-						   counters */
 #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
-- 
cgit v1.2.3


From 96540c78583a417113df4d027e6b68a595ab9a09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 30 Sep 2010 02:25:55 +0000
Subject: xfs: do not use xfs_mod_incore_sb for per-cpu counters

Export xfs_icsb_modify_counters and always use it for modifying
the per-cpu counters.  Remove support for per-cpu counters from
xfs_mod_incore_sb to simplify it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c  | 32 ++++++++++++++++++--------------
 fs/xfs/xfs_fsops.c |  3 ++-
 fs/xfs/xfs_mount.c | 32 +++++++++-----------------------
 fs/xfs/xfs_mount.h |  4 ++++
 fs/xfs/xfs_trans.c |  4 ++--
 5 files changed, 35 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5e33b7862d41..8abd12e32e13 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
 			nblks += cur->bc_private.b.allocated;
 		ASSERT(nblks <= da_old);
 		if (nblks < da_old)
-			xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
 				(int64_t)(da_old - nblks), rsvd);
 	}
 	/*
@@ -1079,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
 		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
-		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
+		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+					     -((int64_t)diff), rsvd)) {
 			/*
 			 * Ick gross gag me with a spoon.
 			 */
@@ -1089,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
 					temp--;
 					diff--;
 					if (!diff ||
-					    !xfs_mod_incore_sb(ip->i_mount,
-						    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+					    !xfs_icsb_modify_counters(ip->i_mount,
+						    XFS_SBS_FDBLOCKS,
+						    -((int64_t)diff), rsvd))
 						break;
 				}
 				if (temp2) {
 					temp2--;
 					diff--;
 					if (!diff ||
-					    !xfs_mod_incore_sb(ip->i_mount,
-						    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+					    !xfs_icsb_modify_counters(ip->i_mount,
+						    XFS_SBS_FDBLOCKS,
+						    -((int64_t)diff), rsvd))
 						break;
 				}
 			}
@@ -1766,7 +1769,7 @@ xfs_bmap_add_extent_hole_delay(
 	}
 	if (oldlen != newlen) {
 		ASSERT(oldlen > newlen);
-		xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
 			(int64_t)(oldlen - newlen), rsvd);
 		/*
 		 * Nothing to do for disk quota accounting here.
@@ -3111,9 +3114,10 @@ xfs_bmap_del_extent(
 	 * Nothing to do for disk quota accounting here.
 	 */
 	ASSERT(da_old >= da_new);
-	if (da_old > da_new)
-		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
-			rsvd);
+	if (da_old > da_new) {
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+			(int64_t)(da_old - da_new), rsvd);
+	}
 done:
 	*logflagsp = flags;
 	return error;
@@ -4526,13 +4530,13 @@ xfs_bmapi(
 							-((int64_t)extsz), (flags &
 							XFS_BMAPI_RSVBLOCKS));
 				} else {
-					error = xfs_mod_incore_sb(mp,
+					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							-((int64_t)alen), (flags &
 							XFS_BMAPI_RSVBLOCKS));
 				}
 				if (!error) {
-					error = xfs_mod_incore_sb(mp,
+					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							-((int64_t)indlen), (flags &
 							XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4546,7 @@ xfs_bmapi(
 							(int64_t)extsz, (flags &
 							XFS_BMAPI_RSVBLOCKS));
 					else if (error)
-						xfs_mod_incore_sb(mp,
+						xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							(int64_t)alen, (flags &
 							XFS_BMAPI_RSVBLOCKS));
@@ -5206,7 +5210,7 @@ xfs_bunmapi(
 					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
-				xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
+				xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
 						(int64_t)del.br_blockcount, rsvd);
 				(void)xfs_trans_reserve_quota_nblks(NULL,
 					ip, -((long)del.br_blockcount), 0,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 6a1edb1348f6..a7c116e814af 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -596,7 +596,8 @@ out:
 		 * the extra reserve blocks from the reserve.....
 		 */
 		int error;
-		error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0);
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 fdblks_delta, 0);
 		if (error == ENOSPC)
 			goto retry;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 55de83585e00..20c67ff57536 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 						int);
 STATIC void	xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
 						int);
-STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
-						int64_t, int);
 STATIC void	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
-
 #else
 
 #define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
 #define xfs_icsb_balance_counter_locked(mp, a, b)	do { } while (0)
-#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
-
 #endif
 
 static const struct {
@@ -1843,28 +1838,19 @@ xfs_mod_incore_sb_unlocked(
  */
 int
 xfs_mod_incore_sb(
-	xfs_mount_t	*mp,
-	xfs_sb_field_t	field,
-	int64_t		delta,
-	int		rsvd)
+	struct xfs_mount	*mp,
+	xfs_sb_field_t		field,
+	int64_t			delta,
+	int			rsvd)
 {
-	int	status;
+	int			status;
 
-	/* check for per-cpu counters */
-	switch (field) {
 #ifdef HAVE_PERCPU_SB
-	case XFS_SBS_ICOUNT:
-	case XFS_SBS_IFREE:
-	case XFS_SBS_FDBLOCKS:
-		status = xfs_icsb_modify_counters(mp, field, delta, rsvd);
-		break;
+	ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
 #endif
-	default:
-		spin_lock(&mp->m_sb_lock);
-		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-		spin_unlock(&mp->m_sb_lock);
-		break;
-	}
+	spin_lock(&mp->m_sb_lock);
+	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+	spin_unlock(&mp->m_sb_lock);
 
 	return status;
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a9d366e7656a..6509c0743957 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -91,6 +91,8 @@ extern void	xfs_icsb_reinit_counters(struct xfs_mount *);
 extern void	xfs_icsb_destroy_counters(struct xfs_mount *);
 extern void	xfs_icsb_sync_counters(struct xfs_mount *, int);
 extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
+extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
+						int64_t, int);
 
 #else
 #define xfs_icsb_init_counters(mp)		(0)
@@ -98,6 +100,8 @@ extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 #define xfs_icsb_reinit_counters(mp)		do { } while (0)
 #define xfs_icsb_sync_counters(mp, flags)	do { } while (0)
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
+#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
+	xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
 
 typedef struct xfs_mount {
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..73cf8f45725e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
 	 * fail if the count would go below zero.
 	 */
 	if (blocks > 0) {
-		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+		error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					  -((int64_t)blocks), rsvd);
 		if (error != 0) {
 			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		(void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+		xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					 (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
-- 
cgit v1.2.3


From 1b0407125f9a5be63e861eb27c8af9e32f20619c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 30 Sep 2010 02:25:56 +0000
Subject: xfs: do not use xfs_mod_incore_sb_batch for per-cpu counters

Update the per-cpu counters manually in xfs_trans_unreserve_and_mod_sb
and remove support for per-cpu counters from xfs_mod_incore_sb_batch
to simplify it.  And added benefit is that we don't have to take
m_sb_lock for transactions that only modify per-cpu counters.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 114 ++++++++++++++++-------------------------------------
 fs/xfs/xfs_trans.c |  78 +++++++++++++++++++++++-------------
 2 files changed, 85 insertions(+), 107 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 20c67ff57536..00538b7e694a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1856,98 +1856,54 @@ xfs_mod_incore_sb(
 }
 
 /*
- * xfs_mod_incore_sb_batch() is used to change more than one field
- * in the in-core superblock structure at a time.  This modification
- * is protected by a lock internal to this module.  The fields and
- * changes to those fields are specified in the array of xfs_mod_sb
- * structures passed in.
+ * Change more than one field in the in-core superblock structure at a time.
  *
- * Either all of the specified deltas will be applied or none of
- * them will.  If any modified field dips below 0, then all modifications
- * will be backed out and EINVAL will be returned.
+ * The fields and changes to those fields are specified in the array of
+ * xfs_mod_sb structures passed in.  Either all of the specified deltas
+ * will be applied or none of them will.  If any modified field dips below 0,
+ * then all modifications will be backed out and EINVAL will be returned.
+ *
+ * Note that this function may not be used for the superblock values that
+ * are tracked with the in-memory per-cpu counters - a direct call to
+ * xfs_icsb_modify_counters is required for these.
  */
 int
-xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
+xfs_mod_incore_sb_batch(
+	struct xfs_mount	*mp,
+	xfs_mod_sb_t		*msb,
+	uint			nmsb,
+	int			rsvd)
 {
-	int		status=0;
-	xfs_mod_sb_t	*msbp;
+	xfs_mod_sb_t		*msbp = &msb[0];
+	int			error = 0;
 
 	/*
-	 * Loop through the array of mod structures and apply each
-	 * individually.  If any fail, then back out all those
-	 * which have already been applied.  Do all of this within
-	 * the scope of the m_sb_lock so that all of the changes will
-	 * be atomic.
+	 * Loop through the array of mod structures and apply each individually.
+	 * If any fail, then back out all those which have already been applied.
+	 * Do all of this within the scope of the m_sb_lock so that all of the
+	 * changes will be atomic.
 	 */
 	spin_lock(&mp->m_sb_lock);
-	msbp = &msb[0];
 	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
-		/*
-		 * Apply the delta at index n.  If it fails, break
-		 * from the loop so we'll fall into the undo loop
-		 * below.
-		 */
-		switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-		case XFS_SBS_ICOUNT:
-		case XFS_SBS_IFREE:
-		case XFS_SBS_FDBLOCKS:
-			spin_unlock(&mp->m_sb_lock);
-			status = xfs_icsb_modify_counters(mp,
-						msbp->msb_field,
-						msbp->msb_delta, rsvd);
-			spin_lock(&mp->m_sb_lock);
-			break;
-#endif
-		default:
-			status = xfs_mod_incore_sb_unlocked(mp,
-						msbp->msb_field,
-						msbp->msb_delta, rsvd);
-			break;
-		}
+		ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
+		       msbp->msb_field > XFS_SBS_FDBLOCKS);
 
-		if (status != 0) {
-			break;
-		}
+		error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						   msbp->msb_delta, rsvd);
+		if (error)
+			goto unwind;
 	}
+	spin_unlock(&mp->m_sb_lock);
+	return 0;
 
-	/*
-	 * If we didn't complete the loop above, then back out
-	 * any changes made to the superblock.  If you add code
-	 * between the loop above and here, make sure that you
-	 * preserve the value of status. Loop back until
-	 * we step below the beginning of the array.  Make sure
-	 * we don't touch anything back there.
-	 */
-	if (status != 0) {
-		msbp--;
-		while (msbp >= msb) {
-			switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-			case XFS_SBS_ICOUNT:
-			case XFS_SBS_IFREE:
-			case XFS_SBS_FDBLOCKS:
-				spin_unlock(&mp->m_sb_lock);
-				status = xfs_icsb_modify_counters(mp,
-						msbp->msb_field,
-						-(msbp->msb_delta),
-						rsvd);
-				spin_lock(&mp->m_sb_lock);
-				break;
-#endif
-			default:
-				status = xfs_mod_incore_sb_unlocked(mp,
-							msbp->msb_field,
-							-(msbp->msb_delta),
-							rsvd);
-				break;
-			}
-			ASSERT(status == 0);
-			msbp--;
-		}
+unwind:
+	while (--msbp >= msb) {
+		error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						   -msbp->msb_delta, rsvd);
+		ASSERT(error == 0);
 	}
 	spin_unlock(&mp->m_sb_lock);
-	return status;
+	return error;
 }
 
 /*
@@ -2478,7 +2434,7 @@ xfs_icsb_balance_counter(
 	spin_unlock(&mp->m_sb_lock);
 }
 
-STATIC int
+int
 xfs_icsb_modify_counters(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 73cf8f45725e..5fab0e6bf86e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1009,7 +1009,7 @@ void
 xfs_trans_unreserve_and_mod_sb(
 	xfs_trans_t	*tp)
 {
-	xfs_mod_sb_t	msb[14];	/* If you add cases, add entries */
+	xfs_mod_sb_t	msb[9];	/* If you add cases, add entries */
 	xfs_mod_sb_t	*msbp;
 	xfs_mount_t	*mp = tp->t_mountp;
 	/* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
 	int		rsvd;
 	int64_t		blkdelta = 0;
 	int64_t		rtxdelta = 0;
+	int64_t		idelta = 0;
+	int64_t		ifreedelta = 0;
 
 	msbp = msb;
 	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
-	/* calculate free blocks delta */
+	/* calculate deltas */
 	if (tp->t_blk_res > 0)
 		blkdelta = tp->t_blk_res;
-
 	if ((tp->t_fdblocks_delta != 0) &&
 	    (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
 	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
 	        blkdelta += tp->t_fdblocks_delta;
 
-	if (blkdelta != 0) {
-		msbp->msb_field = XFS_SBS_FDBLOCKS;
-		msbp->msb_delta = blkdelta;
-		msbp++;
-	}
-
-	/* calculate free realtime extents delta */
 	if (tp->t_rtx_res > 0)
 		rtxdelta = tp->t_rtx_res;
-
 	if ((tp->t_frextents_delta != 0) &&
 	    (tp->t_flags & XFS_TRANS_SB_DIRTY))
 		rtxdelta += tp->t_frextents_delta;
 
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+		idelta = tp->t_icount_delta;
+		ifreedelta = tp->t_ifree_delta;
+	}
+
+	/* apply the per-cpu counters */
+	if (blkdelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 blkdelta, rsvd);
+		if (error)
+			goto out;
+	}
+
+	if (idelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
+						 idelta, rsvd);
+		if (error)
+			goto out_undo_fdblocks;
+	}
+
+	if (ifreedelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
+						 ifreedelta, rsvd);
+		if (error)
+			goto out_undo_icount;
+	}
+
+	/* apply remaining deltas */
 	if (rtxdelta != 0) {
 		msbp->msb_field = XFS_SBS_FREXTENTS;
 		msbp->msb_delta = rtxdelta;
 		msbp++;
 	}
 
-	/* apply remaining deltas */
-
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
-	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
-		if (tp->t_icount_delta != 0) {
-			msbp->msb_field = XFS_SBS_ICOUNT;
-			msbp->msb_delta = tp->t_icount_delta;
-			msbp++;
-		}
-		if (tp->t_ifree_delta != 0) {
-			msbp->msb_field = XFS_SBS_IFREE;
-			msbp->msb_delta = tp->t_ifree_delta;
-			msbp++;
-		}
-	}
-
 	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
 		if (tp->t_dblocks_delta != 0) {
 			msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
 	if (msbp > msb) {
 		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
 			(uint)(msbp - msb), rsvd);
-		ASSERT(error == 0);
+		if (error)
+			goto out_undo_ifreecount;
 	}
+
+	return;
+
+out_undo_ifreecount:
+	if (ifreedelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+out_undo_icount:
+	if (idelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+out_undo_fdblocks:
+	if (blkdelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+out:
+	ASSERT(error = 0);
+	return;
 }
 
 /*
-- 
cgit v1.2.3


From d276734d937a649ff43fd197d0df7a747bd55b7e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 18:31:23 +0000
Subject: xfs: fix bogus m_maxagi check in xfs_iget

These days inode64 should only control which AGs we allocate new
inodes from, while we still try to support reading all existing
inodes.  To make this actually work the check ontop of xfs_iget
needs to be relaxed to allow inodes in all allocation groups instead
of just those that we allow allocating inodes from.  Note that we
can't simply remove the check - it prevents us from accessing
invalid data when fed invalid inode numbers from NFS or bulkstat.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_iget.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..0cdd26932d8e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -365,8 +365,8 @@ xfs_iget(
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 
-	/* the radix tree exists only in inode capable AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+	/* reject inode numbers outside existing AGs */
+	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
-- 
cgit v1.2.3


From dfe188d4283752086d48380cde40d9801c318667 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:12 +0000
Subject: xfs: remove unused t_callback field in struct xfs_trans

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_trans.c | 4 ----
 fs/xfs/xfs_trans.h | 2 --
 2 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5fab0e6bf86e..50bd4acefc00 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1416,10 +1416,6 @@ xfs_trans_committed(
 {
 	struct xfs_log_item_desc *lidp, *next;
 
-	/* Call the transaction's completion callback if there is one. */
-	if (tp->t_callback != NULL)
-		tp->t_callback(tp, tp->t_callarg);
-
 	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
 		xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
 		xfs_trans_free_item_desc(lidp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e2cbe4da7d8e..246286b77a86 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
 						 * transaction. */
 	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
 	struct xfs_dquot_acct   *t_dqinfo;	/* acctg info for dquots */
-	xfs_trans_callback_t	t_callback;	/* transaction callback */
-	void			*t_callarg;	/* callback arg */
 	unsigned int		t_flags;	/* misc flags */
 	int64_t			t_icount_delta;	/* superblock icount change */
 	int64_t			t_ifree_delta;	/* superblock ifree change */
-- 
cgit v1.2.3


From 4957a449a1bce2f5095f57f84114dc038a8f08d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:13 +0000
Subject: xfs: fix the xfs_trans_committed

Use the correct prototype for xfs_trans_committed instead of casting it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_trans.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 50bd4acefc00..f6d956b7711e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1411,9 +1411,10 @@ xfs_trans_item_committed(
  */
 STATIC void
 xfs_trans_committed(
-	struct xfs_trans	*tp,
+	void			*arg,
 	int			abortflag)
 {
+	struct xfs_trans	*tp = arg;
 	struct xfs_log_item_desc *lidp, *next;
 
 	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
@@ -1543,7 +1544,7 @@ xfs_trans_commit_iclog(
 	 * running in simulation mode (the log is explicitly turned
 	 * off).
 	 */
-	tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+	tp->t_logcb.cb_func = xfs_trans_committed;
 	tp->t_logcb.cb_arg = tp;
 
 	/*
-- 
cgit v1.2.3


From 1ae4fe6dba24ebabcd12cd0fa45cc5955394cbd8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:14 +0000
Subject: xfs: remove xfs_refcache.h

This header has been completely unused for a couple of years.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_refcache.h | 52 ---------------------------------------------------
 1 file changed, 52 deletions(-)
 delete mode 100644 fs/xfs/xfs_refcache.h

(limited to 'fs')

diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_REFCACHE_H__
-#define __XFS_REFCACHE_H__
-
-#ifdef HAVE_REFCACHE
-/*
- * Maximum size (in inodes) for the NFS reference cache
- */
-#define XFS_REFCACHE_SIZE_MAX	512
-
-struct xfs_inode;
-struct xfs_mount;
-
-extern void xfs_refcache_insert(struct xfs_inode *);
-extern void xfs_refcache_purge_ip(struct xfs_inode *);
-extern void xfs_refcache_purge_mp(struct xfs_mount *);
-extern void xfs_refcache_purge_some(struct xfs_mount *);
-extern void xfs_refcache_resize(int);
-extern void xfs_refcache_destroy(void);
-
-extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
-
-#else
-
-#define xfs_refcache_insert(ip)		do { } while (0)
-#define xfs_refcache_purge_ip(ip)	do { } while (0)
-#define xfs_refcache_purge_mp(mp)	do { } while (0)
-#define xfs_refcache_purge_some(mp)	do { } while (0)
-#define xfs_refcache_resize(size)	do { } while (0)
-#define xfs_refcache_destroy()		do { } while (0)
-
-#define xfs_refcache_iunlock(ip, flags)	xfs_iunlock(ip, flags)
-
-#endif
-
-#endif	/* __XFS_REFCACHE_H__ */
-- 
cgit v1.2.3


From 668332e5fec809bb100da619fda80e033b12b4a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:15 +0000
Subject: xfs: remove xfs_version.h

It used to have a place when it contained an automatically generated
CVS version, but these days it's entirely superflous.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c   |  1 -
 fs/xfs/linux-2.6/xfs_super.h   |  1 +
 fs/xfs/linux-2.6/xfs_version.h | 29 -----------------------------
 3 files changed, 1 insertion(+), 30 deletions(-)
 delete mode 100644 fs/xfs/linux-2.6/xfs_version.h

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a316c7316702..48d5f8206549 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
-#include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
 # define XFS_DBG_STRING		"no debug"
 #endif
 
+#define XFS_VERSION_STRING	"SGI XFS"
 #define XFS_BUILD_OPTIONS	XFS_ACL_STRING \
 				XFS_SECURITY_STRING \
 				XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VERSION_H__
-#define __XFS_VERSION_H__
-
-/*
- * Dummy file that can contain a timestamp to put into the
- * XFS init string, to help users keep track of what they're
- * running
- */
-
-#define XFS_VERSION_STRING "SGI XFS"
-
-#endif /* __XFS_VERSION_H__ */
-- 
cgit v1.2.3


From 78a4b0961ff241c6f23b16329db0d67e97cb86a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:16 +0000
Subject: xfs: remove xfs_globals.h

This header only provides one extern that isn't actually declared
anywhere, and shadowed by a macro.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_globals.h | 23 -----------------------
 fs/xfs/linux-2.6/xfs_linux.h   |  1 -
 2 files changed, 24 deletions(-)
 delete mode 100644 fs/xfs/linux-2.6/xfs_globals.h

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-
-extern uint64_t	xfs_panic_mask;		/* set to cause more panics */
-
-#endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..1a2afcd87ef1 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -86,7 +86,6 @@
 #include <xfs_iops.h>
 #include <xfs_aops.h>
 #include <xfs_super.h>
-#include <xfs_globals.h>
 #include <xfs_buf.h>
 
 /*
-- 
cgit v1.2.3


From 6c77b0ea1bdf85dfd48c20ceb10fd215a95c66e2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:17 +0000
Subject: xfs: remove xfs_cred.h

We're not actually passing around credentials inside XFS for a while
now, so remove all xfs_cred.h with it's cred_t typedef and all
instances of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_cred.h    | 28 ----------------------------
 fs/xfs/linux-2.6/xfs_globals.c |  1 -
 fs/xfs/linux-2.6/xfs_iops.c    |  4 ++--
 fs/xfs/linux-2.6/xfs_linux.h   |  2 +-
 fs/xfs/quota/xfs_qm.c          |  8 +++-----
 fs/xfs/xfs_inode.c             |  1 -
 fs/xfs/xfs_inode.h             |  5 ++---
 fs/xfs/xfs_mount.h             |  1 -
 fs/xfs/xfs_utils.c             |  5 ++---
 fs/xfs/xfs_utils.h             |  3 +--
 fs/xfs/xfs_vnodeops.c          | 12 +++++-------
 fs/xfs/xfs_vnodeops.h          |  6 ++----
 12 files changed, 18 insertions(+), 58 deletions(-)
 delete mode 100644 fs/xfs/linux-2.6/xfs_cred.h

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CRED_H__
-#define __XFS_CRED_H__
-
-#include <linux/capability.h>
-
-/*
- * Credentials
- */
-typedef const struct cred cred_t;
-
-#endif  /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_cred.h"
 #include "xfs_sysctl.h"
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index a788f016d1fa..ec858e09d546 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -189,7 +189,7 @@ xfs_vn_mknod(
 	}
 
 	xfs_dentry_to_name(&name, dentry);
-	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -362,7 +362,7 @@ xfs_vn_symlink(
 		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
 	if (unlikely(error))
 		goto out;
 
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 1a2afcd87ef1..76ebc582dba0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
 #include <linux/random.h>
 #include <linux/ctype.h>
 #include <linux/writeback.h>
+#include <linux/capability.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -79,7 +80,6 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_cred.h>
 #include <xfs_vnode.h>
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 121f632213a7..a3f8f95d33ea 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint		ndquot;
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
 
-static cred_t	xfs_zerocr;
-
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
@@ -1224,8 +1222,8 @@ xfs_qm_qino_alloc(
 		return error;
 	}
 
-	if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
-				   &xfs_zerocr, 0, 1, ip, &committed))) {
+	error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
+	if (error) {
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
 				 XFS_TRANS_ABORT);
 		return error;
@@ -2143,7 +2141,7 @@ xfs_qm_write_sb_changes(
 
 
 /*
- * Given an inode, a uid and gid (from cred_t) make sure that we have
+ * Given an inode, a uid, gid and prid make sure that we have
  * allocated relevant dquot(s) on disk, and that we won't exceed inode
  * quotas by creating this file.
  * This also attaches dquot(s) to the given inode after locking it,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..4fc72c9e3729 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -982,7 +982,6 @@ xfs_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*cr,
 	xfs_prid_t	prid,
 	int		okalloc,
 	xfs_buf_t	**ialloc_context,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 54781fcd322a..ac82327ec9db 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -212,7 +212,6 @@ typedef struct xfs_icdinode {
 #ifdef __KERNEL__
 
 struct bhv_desc;
-struct cred;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -456,8 +455,8 @@ void		xfs_inode_free(struct xfs_inode *ip);
  * xfs_inode.c prototypes.
  */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-			   xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
-			   int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
+			   xfs_nlink_t, xfs_dev_t, xfs_prid_t, int,
+			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
 
 uint		xfs_ip2xflags(struct xfs_inode *);
 uint		xfs_dic2xflags(struct xfs_dinode *);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 6509c0743957..5861b4980740 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
 
 #include "xfs_sync.h"
 
-struct cred;
 struct log;
 struct xfs_mount_args;
 struct xfs_inode;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4c2ba6f6adc8..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*credp,
 	prid_t		prid,		/* project id */
 	int		okalloc,	/* ok to allocate new space */
 	xfs_inode_t	**ipp,		/* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
 	 * transaction commit so that no other process can steal
 	 * the inode(s) that we've just allocated.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 			  &ialloc_context, &call_again, &ip);
 
 	/*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
 		 * other allocations in this allocation group,
 		 * this call should always succeed.
 		 */
-		code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 				  okalloc, &ialloc_context, &call_again, &ip);
 
 		/*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
 #define __XFS_UTILS_H__
 
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-				xfs_dev_t, cred_t *, prid_t, int,
-				xfs_inode_t **, int *);
+				xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b7bdc43308e4..f82c8032db52 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1256,8 +1256,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1363,7 +1362,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1936,8 +1935,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -2049,8 +2047,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			       1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-- 
cgit v1.2.3


From 1a1a3e97bad42e92cd2f32e81c396c8ee0bddb28 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 6 Oct 2010 18:41:18 +0000
Subject: xfs: remove xfs_buf wrappers

Stop having two different names for many buffer functions and use
the more descriptive xfs_buf_* names directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   | 19 +++++++++----------
 fs/xfs/linux-2.6/xfs_buf.h   | 20 +++-----------------
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/quota/xfs_qm.c        |  2 +-
 fs/xfs/xfs_attr.c            |  6 +++---
 fs/xfs/xfs_btree.c           |  4 ++--
 fs/xfs/xfs_buf_item.c        |  4 ++--
 fs/xfs/xfs_da_btree.c        |  2 +-
 fs/xfs/xfs_dir2_leaf.c       |  2 +-
 fs/xfs/xfs_ialloc.c          |  2 +-
 fs/xfs/xfs_inode.c           |  2 +-
 fs/xfs/xfs_log.c             |  2 +-
 fs/xfs/xfs_log_recover.c     |  6 +++---
 fs/xfs/xfs_mount.c           |  2 +-
 fs/xfs/xfs_trans_buf.c       |  2 +-
 fs/xfs/xfs_vnodeops.c        |  4 ++--
 16 files changed, 33 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 749d7d39d657..47ef97f46fe9 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -652,8 +652,7 @@ void
 xfs_buf_readahead(
 	xfs_buftarg_t		*target,
 	xfs_off_t		ioff,
-	size_t			isize,
-	xfs_buf_flags_t		flags)
+	size_t			isize)
 {
 	struct backing_dev_info *bdi;
 
@@ -661,8 +660,8 @@ xfs_buf_readahead(
 	if (bdi_read_congested(bdi))
 		return;
 
-	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-	xfs_buf_read(target, ioff, isize, flags);
+	xfs_buf_read(target, ioff, isize,
+		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
 }
 
 /*
@@ -691,7 +690,7 @@ xfs_buf_read_uncached(
 	XFS_BUF_BUSY(bp);
 
 	xfsbdstrat(mp, bp);
-	error = xfs_iowait(bp);
+	error = xfs_buf_iowait(bp);
 	if (error || bp->b_error) {
 		xfs_buf_relse(bp);
 		return NULL;
@@ -1073,7 +1072,7 @@ xfs_bdwrite(
 
 /*
  * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  * so that the proper iodone callbacks get called.
  */
 STATIC int
@@ -1090,21 +1089,21 @@ xfs_bioerror(
 	XFS_BUF_ERROR(bp, EIO);
 
 	/*
-	 * We're calling biodone, so delete XBF_DONE flag.
+	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
 	 */
 	XFS_BUF_UNREAD(bp);
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_UNDONE(bp);
 	XFS_BUF_STALE(bp);
 
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 
 	return EIO;
 }
 
 /*
  * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
@@ -1938,7 +1937,7 @@ xfs_flush_buftarg(
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
 			list_del_init(&bp->b_list);
-			xfs_iowait(bp);
+			xfs_buf_iowait(bp);
 			xfs_buf_relse(bp);
 		}
 	}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 161333785f69..131c0ebf2c0d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -218,8 +218,7 @@ extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
 extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t);
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
 struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
 				struct xfs_buftarg *target,
 				xfs_daddr_t daddr, size_t length, int flags);
@@ -247,6 +246,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
@@ -359,21 +360,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 	xfs_buf_rele(bp);
 }
 
-#define xfs_biodone(bp)		xfs_buf_ioend(bp, 0)
-
-#define xfs_biomove(bp, off, len, data, rw) \
-	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
-
-#define xfs_biozero(bp, off, len) \
-	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-
-#define xfs_iowait(bp)	xfs_buf_iowait(bp)
-
-#define xfs_baread(target, rablkno, ralen)  \
-	xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
-
-
 /*
  *	Handling of buftargs.
  */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 48d5f8206549..fa1e40ac4b35 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -644,7 +644,7 @@ xfs_barrier_test(
 	XFS_BUF_ORDERED(sbp);
 
 	xfsbdstrat(mp, sbp);
-	error = xfs_iowait(sbp);
+	error = xfs_buf_iowait(sbp);
 
 	/*
 	 * Clear all the flags we set and possible error state in the
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index a3f8f95d33ea..d109cc557bed 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1433,7 +1433,7 @@ xfs_qm_dqiterate(
 				rablkcnt =  map[i+1].br_blockcount;
 				rablkno = map[i+1].br_startblock;
 				while (rablkcnt--) {
-					xfs_baread(mp->m_ddev_targp,
+					xfs_buf_readahead(mp->m_ddev_targp,
 					       XFS_FSB_TO_DADDR(mp, rablkno),
 					       mp->m_quotainfo->qi_dqchunklen);
 					rablkno++;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 905d390c1e5c..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -1986,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
+			xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2116,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
+		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
-			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+			xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
 			return (error);
 		}
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 735dc2e671b1..04f9cca8da7e 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -656,7 +656,7 @@ xfs_btree_reada_bufl(
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
 /*
@@ -676,7 +676,7 @@ xfs_btree_reada_bufs(
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ee7557611b6e..2686d0d54c5b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -973,7 +973,7 @@ xfs_buf_iodone_callbacks(
 			xfs_buf_do_callbacks(bp, lip);
 			XFS_BUF_SET_FSPRIVATE(bp, NULL);
 			XFS_BUF_CLR_IODONE_FUNC(bp);
-			xfs_biodone(bp);
+			xfs_buf_ioend(bp, 0);
 			return;
 		}
 
@@ -1032,7 +1032,7 @@ xfs_buf_iodone_callbacks(
 	xfs_buf_do_callbacks(bp, lip);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
 /*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
 				mappedbno, nmapped, 0, &bp);
 			break;
 		case 3:
-			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+			xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
 			error = 0;
 			bp = NULL;
 			break;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
 				if (i > ra_current &&
 				    map[ra_index].br_blockcount >=
 				    mp->m_dirblkfsbs) {
-					xfs_baread(mp->m_ddev_targp,
+					xfs_buf_readahead(mp->m_ddev_targp,
 						XFS_FSB_TO_DADDR(mp,
 						   map[ra_index].br_startblock +
 						   ra_offset),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
 		 *	to log a whole cluster of inodes instead of all the
 		 *	individual transactions causing a lot of log traffic.
 		 */
-		xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
 			uint	isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4fc72c9e3729..493d6b0cbef2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2724,7 +2724,7 @@ cluster_corrupt_out:
 			XFS_BUF_UNDONE(bp);
 			XFS_BUF_STALE(bp);
 			XFS_BUF_ERROR(bp,EIO);
-			xfs_biodone(bp);
+			xfs_buf_ioend(bp, 0);
 		} else {
 			XFS_BUF_STALE(bp);
 			xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c8a309424307..f4fd49c9b987 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1310,7 +1310,7 @@ xlog_bdstrat(
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		XFS_BUF_ERROR(bp, EIO);
 		XFS_BUF_STALE(bp);
-		xfs_biodone(bp);
+		xfs_buf_ioend(bp, 0);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 351d71117f16..966d3f97458c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -168,7 +168,7 @@ xlog_bread_noalign(
 	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
-	error = xfs_iowait(bp);
+	error = xfs_buf_iowait(bp);
 	if (error)
 		xfs_ioerror_alert("xlog_bread", log->l_mp,
 				  bp, XFS_BUF_ADDR(bp));
@@ -328,7 +328,7 @@ xlog_recover_iodone(
 					SHUTDOWN_META_IO_ERROR);
 	}
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
 /*
@@ -3816,7 +3816,7 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	xfsbdstrat(log->l_mp, bp);
-	error = xfs_iowait(bp);
+	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_ioerror_alert("xlog_do_recover",
 				  log->l_mp, bp, XFS_BUF_ADDR(bp));
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00538b7e694a..b1498ab5a399 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1607,7 +1607,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
-		error = xfs_iowait(sbp);
+		error = xfs_buf_iowait(sbp);
 		if (error)
 			xfs_ioerror_alert("xfs_unmountfs_writesb",
 					  mp, sbp, XFS_BUF_ADDR(sbp));
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			XFS_BUF_READ(bp);
 			xfsbdstrat(tp->t_mountp, bp);
-			error = xfs_iowait(bp);
+			error = xfs_buf_iowait(bp);
 			if (error) {
 				xfs_ioerror_alert("xfs_trans_read_buf", mp,
 						  bp, blkno);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f82c8032db52..f4195f6b8bc4 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2460,7 +2460,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2473,7 +2473,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
-- 
cgit v1.2.3


From 6743099ce57a40509a86849a22317ed4b7516911 Mon Sep 17 00:00:00 2001
From: Arkadiusz Mi?kiewicz <arekm@maven.pl>
Date: Sun, 26 Sep 2010 06:10:18 +0000
Subject: xfs: Extend project quotas to support 32bit project ids
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for 32bit project quota identifiers.

On disk format is backward compatible with 16bit projid numbers. projid
on disk is now kept in two 16bit values - di_projid_lo (which holds the
same position as old 16bit projid value) and new di_projid_hi (takes
existing padding) and converts from/to 32bit value on the fly.

xfs_admin (for existing fs), mkfs.xfs (for new fs) needs to be used
to enable PROJID32BIT support.

Signed-off-by: Arkadiusz Miśkiewicz <arekm@maven.pl>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   | 14 +++++++-------
 fs/xfs/linux-2.6/xfs_ioctl32.c |  4 +++-
 fs/xfs/linux-2.6/xfs_ioctl32.h |  6 ++++--
 fs/xfs/linux-2.6/xfs_linux.h   |  2 +-
 fs/xfs/quota/xfs_qm.c          | 10 +++++-----
 fs/xfs/quota/xfs_qm_bhv.c      |  2 +-
 fs/xfs/quota/xfs_qm_syscalls.c |  2 +-
 fs/xfs/xfs_dinode.h            |  5 +++--
 fs/xfs/xfs_fs.h                |  6 ++++--
 fs/xfs/xfs_inode.c             | 14 ++++++++------
 fs/xfs/xfs_inode.h             | 26 +++++++++++++++++++++++---
 fs/xfs/xfs_itable.c            |  3 ++-
 fs/xfs/xfs_rename.c            |  2 +-
 fs/xfs/xfs_sb.h                | 10 +++++++++-
 fs/xfs/xfs_types.h             |  2 --
 fs/xfs/xfs_vnodeops.c          | 16 ++++++++--------
 16 files changed, 80 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 10206be7a077..2ea238f6d38e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr(
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	fa.fsx_xflags = xfs_ip2xflags(ip);
 	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_projid = ip->i_d.di_projid;
+	fa.fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
 		if (ip->i_afp) {
@@ -909,10 +909,10 @@ xfs_ioctl_setattr(
 		return XFS_ERROR(EIO);
 
 	/*
-	 * Disallow 32bit project ids because on-disk structure
-	 * is 16bit only.
+	 * Disallow 32bit project ids when projid32bit feature is not enabled.
 	 */
-	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+			!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
 		return XFS_ERROR(EINVAL);
 
 	/*
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
 	if (mask & FSX_PROJID) {
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
 		    XFS_IS_PQUOTA_ON(mp) &&
-		    ip->i_d.di_projid != fa->fsx_projid) {
+		    xfs_get_projid(ip) != fa->fsx_projid) {
 			ASSERT(tp);
 			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -1063,12 +1063,12 @@ xfs_ioctl_setattr(
 		 * Change the ownerships and register quota modifications
 		 * in the transaction.
 		 */
-		if (ip->i_d.di_projid != fa->fsx_projid) {
+		if (xfs_get_projid(ip) != fa->fsx_projid) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
 				olddquot = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
-			ip->i_d.di_projid = fa->fsx_projid;
+			xfs_set_projid(ip, fa->fsx_projid);
 
 			/*
 			 * We may have to rev the inode as well as
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 464bcc76e980..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_extsize,	&bstat32->bs_extsize)	||
 	    get_user(bstat->bs_extents,	&bstat32->bs_extents)	||
 	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
-	    get_user(bstat->bs_projid,	&bstat32->bs_projid)	||
+	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_extents,  &p32->bs_extents)	||
 	    put_user(buffer->bs_gen,	  &p32->bs_gen)		||
 	    put_user(buffer->bs_projid,	  &p32->bs_projid)	||
+	    put_user(buffer->bs_projid_hi,	&p32->bs_projid_hi)	||
 	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
 	__s32		bs_extsize;	/* extent size			*/
 	__s32		bs_extents;	/* number of extents		*/
 	__u32		bs_gen;		/* generation count		*/
-	__u16		bs_projid;	/* project id			*/
-	unsigned char	bs_pad[14];	/* pad space, unused		*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_projid_hi;	/* high part of project id	*/
+	unsigned char	bs_pad[12];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 76ebc582dba0..214ddd71ff79 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -143,7 +143,7 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-#define dfltprid	0
+#define XFS_PROJID_DEFAULT	0
 #define MAXPATHLEN	1024
 
 #define MIN(a,b)	(min(a,b))
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index d109cc557bed..f8e854b4fde8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -835,7 +835,7 @@ xfs_qm_dqattach_locked(
 			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
 						flags & XFS_QMOPT_DQALLOC,
 						ip->i_udquot, &ip->i_gdquot) :
-			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+			xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
 						flags & XFS_QMOPT_DQALLOC,
 						ip->i_udquot, &ip->i_gdquot);
 		/*
@@ -1630,7 +1630,7 @@ xfs_qm_dqusage_adjust(
 	}
 
 	if (XFS_IS_PQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_projid,
+		error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
 						   XFS_DQ_PROJ, nblks, rtblks);
 		if (error)
 			goto error0;
@@ -2249,7 +2249,7 @@ xfs_qm_vop_dqalloc(
 			xfs_dqunlock(gq);
 		}
 	} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
-		if (ip->i_d.di_projid != prid) {
+		if (xfs_get_projid(ip) != prid) {
 			xfs_iunlock(ip, lockflags);
 			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
 						 XFS_DQ_PROJ,
@@ -2371,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
 	}
 	if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
 		if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-		     ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))
+		     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
 			prjflags = XFS_QMOPT_ENOSPC;
 
 		if (prjflags ||
@@ -2475,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
 		ip->i_gdquot = gdqp;
 		ASSERT(XFS_IS_OQUOTA_ON(mp));
 		ASSERT((XFS_IS_GQUOTA_ON(mp) ?
-			ip->i_d.di_gid : ip->i_d.di_projid) ==
+			ip->i_d.di_gid : xfs_get_projid(ip)) ==
 				be32_to_cpu(gdqp->q_core.d_id));
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_dquot_t		*dqp;
 
-	if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
+	if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
 		xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
 		xfs_qm_dqput(dqp);
 	}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 57847434fa51..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1165,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
 	}
 	xfs_qm_internalqcheck_get_dquots(mp,
 					(xfs_dqid_t) ip->i_d.di_uid,
-					(xfs_dqid_t) ip->i_d.di_projid,
+					(xfs_dqid_t) xfs_get_projid(ip),
 					(xfs_dqid_t) ip->i_d.di_gid,
 					&ud, &gd);
 	if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
 	__be32		di_uid;		/* owner's user id */
 	__be32		di_gid;		/* owner's group id */
 	__be32		di_nlink;	/* number of links to file */
-	__be16		di_projid;	/* owner's project id */
-	__u8		di_pad[8];	/* unused, zeroed space */
+	__be16		di_projid_lo;	/* lower part of owner's project id */
+	__be16		di_projid_hi;	/* higher part owner's project id */
+	__u8		di_pad[6];	/* unused, zeroed space */
 	__be16		di_flushiter;	/* incremented on flush */
 	xfs_timestamp_t	di_atime;	/* time last accessed */
 	xfs_timestamp_t	di_mtime;	/* time last modified */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 10f6093671b0..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
 	__s32		bs_extsize;	/* extent size			*/
 	__s32		bs_extents;	/* number of extents		*/
 	__u32		bs_gen;		/* generation count		*/
-	__u16		bs_projid;	/* project id			*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
 	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
-	unsigned char	bs_pad[12];	/* pad space, unused		*/
+	__u16		bs_projid_hi;	/* higher part of project id	*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 493d6b0cbef2..108c7a085f94 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
 	to->di_uid = be32_to_cpu(from->di_uid);
 	to->di_gid = be32_to_cpu(from->di_gid);
 	to->di_nlink = be32_to_cpu(from->di_nlink);
-	to->di_projid = be16_to_cpu(from->di_projid);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = be16_to_cpu(from->di_flushiter);
 	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
 	to->di_uid = cpu_to_be32(from->di_uid);
 	to->di_gid = cpu_to_be32(from->di_gid);
 	to->di_nlink = cpu_to_be32(from->di_nlink);
-	to->di_projid = cpu_to_be16(from->di_projid);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = cpu_to_be16(from->di_flushiter);
 	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
 	if (ip->i_d.di_version == 1) {
 		ip->i_d.di_nlink = ip->i_d.di_onlink;
 		ip->i_d.di_onlink = 0;
-		ip->i_d.di_projid = 0;
+		xfs_set_projid(ip, 0);
 	}
 
 	ip->i_delayed_blks = 0;
@@ -982,7 +984,7 @@ xfs_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	xfs_prid_t	prid,
+	prid_t		prid,
 	int		okalloc,
 	xfs_buf_t	**ialloc_context,
 	boolean_t	*call_again,
@@ -1026,7 +1028,7 @@ xfs_ialloc(
 	ASSERT(ip->i_d.di_nlink == nlink);
 	ip->i_d.di_uid = current_fsuid();
 	ip->i_d.di_gid = current_fsgid();
-	ip->i_d.di_projid = prid;
+	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
 	/*
@@ -3007,7 +3009,7 @@ xfs_iflush_int(
 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 			memset(&(dip->di_pad[0]), 0,
 			      sizeof(dip->di_pad));
-			ASSERT(ip->i_d.di_projid == 0);
+			ASSERT(xfs_get_projid(ip) == 0);
 		}
 	}
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ac82327ec9db..fac52290de90 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
 	__uint32_t	di_uid;		/* owner's user id */
 	__uint32_t	di_gid;		/* owner's group id */
 	__uint32_t	di_nlink;	/* number of links to file */
-	__uint16_t	di_projid;	/* owner's project id */
-	__uint8_t	di_pad[8];	/* unused, zeroed space */
+	__uint16_t	di_projid_lo;	/* lower part of owner's project id */
+	__uint16_t	di_projid_hi;	/* higher part of owner's project id */
+	__uint8_t	di_pad[6];	/* unused, zeroed space */
 	__uint16_t	di_flushiter;	/* incremented on flush */
 	xfs_ictimestamp_t di_atime;	/* time last accessed */
 	xfs_ictimestamp_t di_mtime;	/* time last modified */
@@ -333,6 +334,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 	return ret;
 }
 
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+		prid_t projid)
+{
+	ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+
 /*
  * Manage the i_flush queue embedded in the inode.  This completion
  * queue synchronizes processes attempting to flush the in-core
@@ -455,7 +475,7 @@ void		xfs_inode_free(struct xfs_inode *ip);
  * xfs_inode.c prototypes.
  */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-			   xfs_nlink_t, xfs_dev_t, xfs_prid_t, int,
+			   xfs_nlink_t, xfs_dev_t, prid_t, int,
 			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
 
 uint		xfs_ip2xflags(struct xfs_inode *);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
 	 * further change.
 	 */
 	buf->bs_nlink = dic->di_nlink;
-	buf->bs_projid = dic->di_projid;
+	buf->bs_projid_lo = dic->di_projid_lo;
+	buf->bs_projid_hi = dic->di_projid_hi;
 	buf->bs_ino = ino;
 	buf->bs_mode = dic->di_mode;
 	buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 9028733f7ed8..d2af0a8381a6 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
 	 * tree quota mechanism would be circumvented.
 	 */
 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+		     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_RESERVED4BIT	0x00000004
 #define XFS_SB_VERSION2_ATTR2BIT	0x00000008	/* Inline attr rework */
 #define XFS_SB_VERSION2_PARENTBIT	0x00000010	/* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT	0x00000080	/* 32 bit project id */
 
 #define	XFS_SB_VERSION2_OKREALFBITS	\
 	(XFS_SB_VERSION2_LAZYSBCOUNTBIT	| \
-	 XFS_SB_VERSION2_ATTR2BIT)
+	 XFS_SB_VERSION2_ATTR2BIT	| \
+	 XFS_SB_VERSION2_PROJID32BIT)
 #define	XFS_SB_VERSION2_OKSASHFBITS	\
 	(0)
 #define XFS_SB_VERSION2_OKREALBITS	\
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
 
+static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+{
+	return xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+}
+
 /*
  * end of superblock version macros
  */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
 typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
-typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
-
 typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
 
 /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f4195f6b8bc4..8e4a63c4151a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+		code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 					 qflags, &udqp, &gdqp);
 		if (code)
 			return code;
@@ -1268,7 +1268,7 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
@@ -1281,9 +1281,9 @@ xfs_create(
 		return XFS_ERROR(EIO);
 
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1882,7 +1882,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1956,7 +1956,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -1979,9 +1979,9 @@ xfs_symlink(
 
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
-- 
cgit v1.2.3


From a731cd116c9334e01bcf3e676c0c621fe7de6ce4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Sep 2010 14:33:15 +0000
Subject: xfs: semaphore cleanup

Get rid of init_MUTEX[_LOCKED]() and use sema_init() instead.

(Ported to current XFS code by <aelder@sgi.com>.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 47ef97f46fe9..52785189212f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -189,7 +189,7 @@ _xfs_buf_initialize(
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
 	RB_CLEAR_NODE(&bp->b_rbnode);
-	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
 	bp->b_file_offset = range_base;
-- 
cgit v1.2.3


From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Tue, 19 Oct 2010 09:05:00 +0200
Subject: block: fix accounting bug on cross partition merges

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

When reloading partition tables, quiesce IO to ensure that no
request references to the partition struct exists. When it is safe
to free the partition table, the IO for that device is restarted
again.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         | 24 ++++++++++++++++--------
 block/blk-merge.c        |  2 +-
 block/blk.h              |  4 ----
 block/genhd.c            | 14 ++++++++++++++
 fs/partitions/check.c    | 12 ++++++++++++
 include/linux/blkdev.h   |  1 +
 include/linux/elevator.h |  2 ++
 include/linux/genhd.h    |  1 +
 8 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/block/blk-core.c b/block/blk-core.c
index 797d5095eb83..ddc68332d655 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io)
+	if (!new_io) {
+		part = rq->part;
 		part_stat_inc(cpu, part, merges[rw]);
-	else {
+	} else {
+		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->part = part;
 	}
 
 	part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
+	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->starved[is_sync] = 0;
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv)
+	if (priv) {
 		rl->elvpriv++;
 
-	if (blk_queue_io_stat(q))
-		rw_flags |= REQ_IO_STAT;
+		/*
+		 * Don't do stats for non-priv requests
+		 */
+		if (blk_queue_io_stat(q))
+			rw_flags |= REQ_IO_STAT;
+	}
+
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6a725461654d..38ff234012a4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
diff --git a/block/blk.h b/block/blk.h
index 6738831ba447..1340cce5721a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
diff --git a/block/genhd.c b/block/genhd.c
index 7923e720ddf5..8313834596db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
 {
 	struct disk_part_tbl *ptbl =
 		container_of(head, struct disk_part_tbl, rcu_head);
+	struct gendisk *disk = ptbl->disk;
+	struct request_queue *q = disk->queue;
+	unsigned long flags;
 
 	kfree(ptbl);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /**
@@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 				  struct disk_part_tbl *new_ptbl)
 {
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
+	struct request_queue *q = disk->queue;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
 
 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
+
+		spin_lock_irq(q->queue_lock);
+		elv_quiesce_start(q);
+		spin_unlock_irq(q->queue_lock);
+
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
 	}
 }
@@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 		return -ENOMEM;
 
 	new_ptbl->len = target;
+	new_ptbl->disk = disk;
 
 	for (i = 0; i < len; i++)
 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6dfbee03ccc6..30f46c2cb9d5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,17 +365,25 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+	struct gendisk *disk = part_to_disk(part);
+	struct request_queue *q = disk->queue;
+	unsigned long flags;
 
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
 	put_device(part_to_dev(part));
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
+	struct request_queue *q = disk->queue;
 
 	if (partno >= ptbl->len)
 		return;
@@ -390,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
+	spin_lock_irq(q->queue_lock);
+	elv_quiesce_start(q);
+	spin_unlock_irq(q->queue_lock);
+
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8f3dd981b973..16f7f1be1acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
+	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2c958f4fce1e..df1ee866d715 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
+extern void elv_quiesce_start(struct request_queue *);
+extern void elv_quiesce_end(struct request_queue *);
 
 /*
  * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 66e26b5a1537..57647ecfc1bd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -140,6 +140,7 @@ struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
 	struct hd_struct *last_lookup;
+	struct gendisk *disk;
 	struct hd_struct *part[];
 };
 
-- 
cgit v1.2.3


From c4a047272566b44b44222369d50a307c708c4f74 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 24 Aug 2009 22:42:56 +0000
Subject: fix rawctl compat ioctls breakage on amd64 and itanic

RAW_SETBIND and RAW_GETBIND 32bit versions are fscked in interesting ways.

1) fs/compat_ioctl.c has COMPATIBLE_IOCTL(RAW_SETBIND) followed by
HANDLE_IOCTL(RAW_SETBIND, raw_ioctl).  The latter is ignored.

2) on amd64 (and itanic) the damn thing is broken - we have int + u64 + u64
and layouts on i386 and amd64 are _not_ the same.  raw_ioctl() would
work there, but it's never called due to (1).  As it is, i386 /sbin/raw
definitely doesn't work on amd64 boxen.

3) switching to raw_ioctl() as is would *not* work on e.g. sparc64 and ppc64,
which would be rather sad, seeing that normal userland there is 32bit.
The thing is, slapping __packed on the struct in question does not DTRT -
it eliminates *all* padding.  The real solution is to use compat_u64.

4) of course, all that stuff has no business being outside of raw.c in the
first place - there should be ->compat_ioctl() for /dev/rawctl instead of
messing with compat_ioctl.c.

[akpm@linux-foundation.org: coding-style fixes]
[arnd@arndb.de: port to 2.6.36]
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/char/raw.c | 243 ++++++++++++++++++++++++++++++-----------------------
 fs/compat_ioctl.c  |  70 ---------------
 2 files changed, 140 insertions(+), 173 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index b38942f6bf31..24b2b9160aa6 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -19,8 +19,8 @@
 #include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 
@@ -55,7 +55,6 @@ static int raw_open(struct inode *inode, struct file *filp)
 		return 0;
 	}
 
-	lock_kernel();
 	mutex_lock(&raw_mutex);
 
 	/*
@@ -82,7 +81,6 @@ static int raw_open(struct inode *inode, struct file *filp)
 			bdev->bd_inode->i_mapping;
 	filp->private_data = bdev;
 	mutex_unlock(&raw_mutex);
-	unlock_kernel();
 	return 0;
 
 out2:
@@ -91,7 +89,6 @@ out1:
 	blkdev_put(bdev, filp->f_mode);
 out:
 	mutex_unlock(&raw_mutex);
-	unlock_kernel();
 	return err;
 }
 
@@ -125,20 +122,84 @@ static long
 raw_ioctl(struct file *filp, unsigned int command, unsigned long arg)
 {
 	struct block_device *bdev = filp->private_data;
-	int ret;
+	return blkdev_ioctl(bdev, 0, command, arg);
+}
+
+static int bind_set(int number, u64 major, u64 minor)
+{
+	dev_t dev = MKDEV(major, minor);
+	struct raw_device_data *rawdev;
+	int err = 0;
 
-	lock_kernel();
-	ret = blkdev_ioctl(bdev, 0, command, arg);
-	unlock_kernel();
+	if (number <= 0 || number >= MAX_RAW_MINORS)
+		return -EINVAL;
 
-	return ret;
+	if (MAJOR(dev) != major || MINOR(dev) != minor)
+		return -EINVAL;
+
+	rawdev = &raw_devices[number];
+
+	/*
+	 * This is like making block devices, so demand the
+	 * same capability
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * For now, we don't need to check that the underlying
+	 * block device is present or not: we can do that when
+	 * the raw device is opened.  Just check that the
+	 * major/minor numbers make sense.
+	 */
+
+	if (MAJOR(dev) == 0 && dev != 0)
+		return -EINVAL;
+
+	mutex_lock(&raw_mutex);
+	if (rawdev->inuse) {
+		mutex_unlock(&raw_mutex);
+		return -EBUSY;
+	}
+	if (rawdev->binding) {
+		bdput(rawdev->binding);
+		module_put(THIS_MODULE);
+	}
+	if (!dev) {
+		/* unbind */
+		rawdev->binding = NULL;
+		device_destroy(raw_class, MKDEV(RAW_MAJOR, number));
+	} else {
+		rawdev->binding = bdget(dev);
+		if (rawdev->binding == NULL) {
+			err = -ENOMEM;
+		} else {
+			dev_t raw = MKDEV(RAW_MAJOR, number);
+			__module_get(THIS_MODULE);
+			device_destroy(raw_class, raw);
+			device_create(raw_class, NULL, raw, NULL,
+				      "raw%d", number);
+		}
+	}
+	mutex_unlock(&raw_mutex);
+	return err;
 }
 
-static void bind_device(struct raw_config_request *rq)
+static int bind_get(int number, dev_t *dev)
 {
-	device_destroy(raw_class, MKDEV(RAW_MAJOR, rq->raw_minor));
-	device_create(raw_class, NULL, MKDEV(RAW_MAJOR, rq->raw_minor), NULL,
-		      "raw%d", rq->raw_minor);
+	struct raw_device_data *rawdev;
+	struct block_device *bdev;
+
+	if (number <= 0 || number >= MAX_RAW_MINORS)
+		return -EINVAL;
+
+	rawdev = &raw_devices[number];
+
+	mutex_lock(&raw_mutex);
+	bdev = rawdev->binding;
+	*dev = bdev ? bdev->bd_dev : 0;
+	mutex_unlock(&raw_mutex);
+	return 0;
 }
 
 /*
@@ -149,105 +210,78 @@ static long raw_ctl_ioctl(struct file *filp, unsigned int command,
 			  unsigned long arg)
 {
 	struct raw_config_request rq;
-	struct raw_device_data *rawdev;
-	int err = 0;
+	dev_t dev;
+	int err;
 
-	lock_kernel();
 	switch (command) {
 	case RAW_SETBIND:
+		if (copy_from_user(&rq, (void __user *) arg, sizeof(rq)))
+			return -EFAULT;
+
+		return bind_set(rq.raw_minor, rq.block_major, rq.block_minor);
+
 	case RAW_GETBIND:
+		if (copy_from_user(&rq, (void __user *) arg, sizeof(rq)))
+			return -EFAULT;
 
-		/* First, find out which raw minor we want */
+		err = bind_get(rq.raw_minor, &dev);
+		if (err)
+			return err;
 
-		if (copy_from_user(&rq, (void __user *) arg, sizeof(rq))) {
-			err = -EFAULT;
-			goto out;
-		}
+		rq.block_major = MAJOR(dev);
+		rq.block_minor = MINOR(dev);
 
-		if (rq.raw_minor <= 0 || rq.raw_minor >= MAX_RAW_MINORS) {
-			err = -EINVAL;
-			goto out;
-		}
-		rawdev = &raw_devices[rq.raw_minor];
-
-		if (command == RAW_SETBIND) {
-			dev_t dev;
-
-			/*
-			 * This is like making block devices, so demand the
-			 * same capability
-			 */
-			if (!capable(CAP_SYS_ADMIN)) {
-				err = -EPERM;
-				goto out;
-			}
-
-			/*
-			 * For now, we don't need to check that the underlying
-			 * block device is present or not: we can do that when
-			 * the raw device is opened.  Just check that the
-			 * major/minor numbers make sense.
-			 */
-
-			dev = MKDEV(rq.block_major, rq.block_minor);
-			if ((rq.block_major == 0 && rq.block_minor != 0) ||
-					MAJOR(dev) != rq.block_major ||
-					MINOR(dev) != rq.block_minor) {
-				err = -EINVAL;
-				goto out;
-			}
-
-			mutex_lock(&raw_mutex);
-			if (rawdev->inuse) {
-				mutex_unlock(&raw_mutex);
-				err = -EBUSY;
-				goto out;
-			}
-			if (rawdev->binding) {
-				bdput(rawdev->binding);
-				module_put(THIS_MODULE);
-			}
-			if (rq.block_major == 0 && rq.block_minor == 0) {
-				/* unbind */
-				rawdev->binding = NULL;
-				device_destroy(raw_class,
-						MKDEV(RAW_MAJOR, rq.raw_minor));
-			} else {
-				rawdev->binding = bdget(dev);
-				if (rawdev->binding == NULL)
-					err = -ENOMEM;
-				else {
-					__module_get(THIS_MODULE);
-					bind_device(&rq);
-				}
-			}
-			mutex_unlock(&raw_mutex);
-		} else {
-			struct block_device *bdev;
-
-			mutex_lock(&raw_mutex);
-			bdev = rawdev->binding;
-			if (bdev) {
-				rq.block_major = MAJOR(bdev->bd_dev);
-				rq.block_minor = MINOR(bdev->bd_dev);
-			} else {
-				rq.block_major = rq.block_minor = 0;
-			}
-			mutex_unlock(&raw_mutex);
-			if (copy_to_user((void __user *)arg, &rq, sizeof(rq))) {
-				err = -EFAULT;
-				goto out;
-			}
-		}
-		break;
-	default:
-		err = -EINVAL;
-		break;
+		if (copy_to_user((void __user *)arg, &rq, sizeof(rq)))
+			return -EFAULT;
+
+		return 0;
 	}
-out:
-	unlock_kernel();
-	return err;
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+struct raw32_config_request {
+	compat_int_t	raw_minor;
+	compat_u64	block_major;
+	compat_u64	block_minor;
+};
+
+static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct raw32_config_request __user *user_req = compat_ptr(arg);
+	struct raw32_config_request rq;
+	dev_t dev;
+	int err = 0;
+
+	switch (cmd) {
+	case RAW_SETBIND:
+		if (copy_from_user(&rq, user_req, sizeof(rq)))
+			return -EFAULT;
+
+		return bind_set(rq.raw_minor, rq.block_major, rq.block_minor);
+
+	case RAW_GETBIND:
+		if (copy_from_user(&rq, user_req, sizeof(rq)))
+			return -EFAULT;
+
+		err = bind_get(rq.raw_minor, &dev);
+		if (err)
+			return err;
+
+		rq.block_major = MAJOR(dev);
+		rq.block_minor = MINOR(dev);
+
+		if (copy_to_user(user_req, &rq, sizeof(rq)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -EINVAL;
 }
+#endif
 
 static const struct file_operations raw_fops = {
 	.read		= do_sync_read,
@@ -263,6 +297,9 @@ static const struct file_operations raw_fops = {
 
 static const struct file_operations raw_ctl_fops = {
 	.unlocked_ioctl = raw_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= raw_ctl_compat_ioctl,
+#endif
 	.open		= raw_open,
 	.owner		= THIS_MODULE,
 };
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..d0ad09d57789 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -599,69 +599,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
-#ifdef CONFIG_BLOCK
-struct raw32_config_request
-{
-        compat_int_t    raw_minor;
-        __u64   block_major;
-        __u64   block_minor;
-} __attribute__((packed));
-
-static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-        int ret;
-
-        if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-
-        ret = __get_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __get_user(req->block_major, &user_req->block_major);
-        ret |= __get_user(req->block_minor, &user_req->block_minor);
-
-        return ret ? -EFAULT : 0;
-}
-
-static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-	int ret;
-
-        if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-
-        ret = __put_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __put_user(req->block_major, &user_req->block_major);
-        ret |= __put_user(req->block_minor, &user_req->block_minor);
-
-        return ret ? -EFAULT : 0;
-}
-
-static int raw_ioctl(unsigned fd, unsigned cmd,
-		struct raw32_config_request __user *user_req)
-{
-        int ret;
-
-        switch (cmd) {
-        case RAW_SETBIND:
-	default: {	/* RAW_GETBIND */
-                struct raw_config_request req;
-                mm_segment_t oldfs = get_fs();
-
-                if ((ret = get_raw32_request(&req, user_req)))
-                        return ret;
-
-                set_fs(KERNEL_DS);
-                ret = sys_ioctl(fd,cmd,(unsigned long)&req);
-                set_fs(oldfs);
-
-                if ((!ret) && (cmd == RAW_GETBIND)) {
-                        ret = set_raw32_request(&req, user_req);
-                }
-                break;
-        }
-        }
-        return ret;
-}
-#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
         compat_int_t    type;
@@ -1262,9 +1199,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
 COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* Raw devices */
-COMPATIBLE_IOCTL(RAW_SETBIND)
-COMPATIBLE_IOCTL(RAW_GETBIND)
 /* SMB ioctls which do not need any translations */
 COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
 /* Watchdog */
@@ -1523,10 +1457,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case MTIOCGET32:
 	case MTIOCPOS32:
 		return mt_ioctl_trans(fd, cmd, argp);
-	/* Raw devices */
-	case RAW_SETBIND:
-	case RAW_GETBIND:
-		return raw_ioctl(fd, cmd, argp);
 #endif
 	/* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-- 
cgit v1.2.3


From 89f150f401c32b0a587dcb98d3bcfafe0b9c1c70 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Tue, 19 Oct 2010 11:47:52 -0500
Subject: Clean up two declarations of blob_len

- Eliminate double declaration of variable blob_len
- Modify function build_ntlmssp_auth_blob to return error code
  as well as length of the blob.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2111bed71b1f..e35dc60d3255 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -464,6 +464,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    maximum possible size is fixed and small, making this approach cleaner.
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+					u16 *buflen,
 				   struct cifsSesInfo *ses,
 				   const struct nls_table *nls_cp)
 {
@@ -558,7 +559,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->SessionKey.MaximumLength = 0;
 
 setup_ntlmv2_ret:
-	return tmp - pbuffer;
+	*buflen = tmp - pbuffer;
+	return rc;
 }
 
 
@@ -591,7 +593,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	int blob_len;
+	u16 blob_len;
 	char *ntlmsspblob = NULL;
 
 	if (ses == NULL)
@@ -848,8 +850,10 @@ ssetup_ntlmssp_authenticate:
 					goto ssetup_exit;
 				}
 
-				blob_len = build_ntlmssp_auth_blob(ntlmsspblob,
-							ses, nls_cp);
+				rc = build_ntlmssp_auth_blob(ntlmsspblob,
+							&blob_len, ses, nls_cp);
+				if (rc)
+					goto ssetup_exit;
 				iov[1].iov_len = blob_len;
 				iov[1].iov_base = ntlmsspblob;
 				pSMB->req.SecurityBlobLength =
@@ -927,7 +931,6 @@ ssetup_ntlmssp_authenticate:
 	bcc_ptr = pByteArea(smb_buf);
 
 	if (smb_buf->WordCount == 4) {
-		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
 			cERROR(1, "bad security blob length %d", blob_len);
-- 
cgit v1.2.3


From 3e24e132878c83910b61eb7704511a6d96a0389f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Oct 2010 17:55:54 +0200
Subject: cifs: cancel_delayed_work() + flush_scheduled_work() ->
 cancel_delayed_work_sync()

flush_scheduled_work() is going away.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f4aab6f01174..c68a056f27fd 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
 void cifs_dfs_release_automount_timer(void)
 {
 	BUG_ON(!list_empty(&cifs_dfs_automount_list));
-	cancel_delayed_work(&cifs_dfs_automount_task);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&cifs_dfs_automount_task);
 }
 
 /**
-- 
cgit v1.2.3


From bc4866b6e0b44f8ea0df22a16e5927714beb4983 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Oct 2010 17:59:08 -0400
Subject: NFS: Don't SIGBUS if nfs_vm_page_mkwrite races with a cache
 invalidation

In the case where we lock the page, and then find out that the page has
been thrown out of the page cache, we should just return VM_FAULT_NOPAGE.
This is what block_page_mkwrite() does in these situations.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/file.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 59cbe1ba0511..39672b731736 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -551,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
 	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +567,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (mapping != dentry->d_inode->i_mapping)
 		goto out_unlock;
 
-	ret = 0;
 	pagelen = nfs_page_length(page);
 	if (pagelen == 0)
 		goto out_unlock;
 
-	ret = nfs_flush_incompatible(filp, page);
-	if (ret != 0)
-		goto out_unlock;
+	ret = VM_FAULT_LOCKED;
+	if (nfs_flush_incompatible(filp, page) == 0 &&
+	    nfs_updatepage(filp, page, 0, pagelen) == 0)
+		goto out;
 
-	ret = nfs_updatepage(filp, page, 0, pagelen);
+	ret = VM_FAULT_SIGBUS;
 out_unlock:
-	if (!ret)
-		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	return VM_FAULT_SIGBUS;
+out:
+	return ret;
 }
 
 static const struct vm_operations_struct nfs_file_vm_ops = {
-- 
cgit v1.2.3


From b0ed9dbc24f1fd912b2dd08b995153cafc1d5b1c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Oct 2010 17:59:08 -0400
Subject: NFSv4: Fix open recovery

NFSv4 open recovery is currently broken: since we do not clear the
state->flags states before attempting recovery, we end up with the
'can_open_cached()' function triggering. This again leads to no OPEN call
being put on the wire.

Reported-by: Sachin Prabhu <sprabhu@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aee1bcc1fcc9..7f25246bf38c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1118,6 +1118,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 	clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	smp_rmb();
 	if (state->n_rdwr != 0) {
+		clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
 		if (ret != 0)
 			return ret;
@@ -1125,6 +1126,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 			return -ESTALE;
 	}
 	if (state->n_wronly != 0) {
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
 		if (ret != 0)
 			return ret;
@@ -1132,6 +1134,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 			return -ESTALE;
 	}
 	if (state->n_rdonly != 0) {
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
 		if (ret != 0)
 			return ret;
-- 
cgit v1.2.3


From ae1007d37e00144b72906a4bdc47d517ae91bcc1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Oct 2010 17:59:08 -0400
Subject: NFSv4: Don't call nfs4_state_mark_reclaim_reboot() from error
 handlers

In the case of a server reboot, the state recovery thread starts by calling
nfs4_state_end_reclaim_reboot() in order to avoid edge conditions when
the server reboots while the client is in the middle of recovery.

However, if the client has already marked the nfs4_state as requiring
reboot recovery, then the above behaviour will cause the recovery thread to
treat the open as if it was part of such an edge condition: the open will
be recovered as if it was part of a lease expiration (and all the locks
will be lost).
Fix is to remove the call to nfs4_state_mark_reclaim_reboot from
nfs4_async_handle_error(), and nfs4_handle_exception(). Instead we leave it
to the recovery thread to do this for us.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/nfs4proc.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7f25246bf38c..88a9d93ec1bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -255,9 +255,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			nfs4_state_mark_reclaim_nograce(clp, state);
 			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
-			if (state == NULL)
-				break;
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
 			goto do_state_recovery;
@@ -3414,9 +3411,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			nfs4_state_mark_reclaim_nograce(clp, state);
 			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
-			if (state == NULL)
-				break;
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
 			goto do_state_recovery;
-- 
cgit v1.2.3


From 6eaa61496fb3b93cceface7a296415fc4c030bce Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Oct 2010 17:59:08 -0400
Subject: NFSv4: Don't call nfs4_reclaim_complete() on receiving
 NFS4ERR_STALE_CLIENTID

If the server sends us an NFS4ERR_STALE_CLIENTID while the state management
thread is busy reclaiming state, we do want to treat all state that wasn't
reclaimed before the STALE_CLIENTID as if a network partition occurred (see
the edge conditions described in RFC3530 and RFC5661).
What we do not want to do is to send an nfs4_reclaim_complete(), since we
haven't yet even started reclaiming state after the server rebooted.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/nfs4state.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..940cf7c070af 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1138,16 +1138,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
 		(void)ops->reclaim_complete(clp);
 }
 
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
 
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-		return;
-
-	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+		return 0;
 
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1161,6 +1159,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	}
 
 	nfs_delegation_reap_unclaimed(clp);
+	return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+	if (!nfs4_state_clear_reclaim_reboot(clp))
+		return;
+	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1187,7 +1193,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_end_reclaim_reboot(clp);
+			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_EXPIRED:
-- 
cgit v1.2.3


From 7669a2c95e502a77f93f27e5449fc93a00d588b6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Mon, 17 May 2010 12:31:35 -0700
Subject: ceph: lookup pool in osdmap by name

Implement a pool lookup by name.  This will be used by rbd.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 13 +++++++++++++
 fs/ceph/osdmap.h |  2 ++
 2 files changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index e31f118f1392..3ccd11761cb6 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -417,6 +417,19 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+	struct rb_node *rbp;
+
+	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rbp, struct ceph_pg_pool_info, node);
+		if (pi->name && strcmp(pi->name, name) == 0)
+			return pi->id;
+	}
+	return -ENOENT;
+}
+
 static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 {
 	rb_erase(&pi->node, root);
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 970b547e510d..a592b211be39 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
 #endif
-- 
cgit v1.2.3


From 3499e8a5d4dbb083324efd942e2c4fb7eb65f27c Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 6 Apr 2010 14:51:47 -0700
Subject: ceph: refactor osdc requests creation functions

The osd requests creation are being decoupled from the
vino parameter, allowing clients using the osd to use
other arbitrary object names that are not necessarily
vino based. Also, calc_raw_layout now takes a snap id.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 187 +++++++++++++++++++++++++++++++++++----------------
 fs/ceph/osd_client.h |  25 +++++++
 2 files changed, 155 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3b5571b8ce22..2647dafd96f5 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -22,6 +22,35 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
 
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 len, u64 *bno,
+			struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_osd_op *op = (void *)(reqhead + 1);
+	u64 orig_len = len;
+	u64 objoff, objlen;    /* extent in object */
+
+	reqhead->snapid = cpu_to_le64(snapid);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, &len, bno,
+				      &objoff, &objlen);
+	if (len < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - len, off, len);
+
+	op->extent.offset = cpu_to_le64(objoff);
+	op->extent.length = cpu_to_le64(objlen);
+	req->r_num_pages = calc_pages_for(off, len);
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+
+}
+
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -48,34 +77,17 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
  * fill osd op in request message.
  */
 static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
+			struct ceph_vino vino,
+			struct ceph_file_layout *layout,
 			u64 off, u64 *plen,
 			struct ceph_osd_request *req)
 {
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_osd_op *op = (void *)(reqhead + 1);
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
 	u64 bno;
 
-	reqhead->snapid = cpu_to_le64(vino.snap);
-
-	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, &bno,
-				      &objoff, &objlen);
-	if (*plen < orig_len)
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off, *plen, &bno, req);
 
 	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);
-
-	op->extent.offset = cpu_to_le64(objoff);
-	op->extent.length = cpu_to_le64(objlen);
-	req->r_num_pages = calc_pages_for(off, *plen);
-
-	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
 }
 
 /*
@@ -108,43 +120,34 @@ void ceph_osdc_release_request(struct kref *kref)
 		kfree(req);
 }
 
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-					       struct ceph_file_layout *layout,
-					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
-					       int opcode, int flags,
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
 					       struct ceph_snap_context *snapc,
 					       int do_sync,
-					       u32 truncate_seq,
-					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	struct ceph_osd_request_head *head;
-	struct ceph_osd_op *op;
-	void *p;
 	int num_op = 1 + do_sync;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int i;
+	size_t msg_size = sizeof(struct ceph_osd_request_head) +
+			  num_op*sizeof(struct ceph_osd_op);
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
+		memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), gfp_flags);
+	}
+	if (!req)
+		return NULL;
 
 	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
 		memset(req, 0, sizeof(*req));
 	} else {
-		req = kzalloc(sizeof(*req), GFP_NOFS);
+		req = kzalloc(sizeof(*req), gfp_flags);
 	}
 	if (req == NULL)
 		return NULL;
@@ -164,7 +167,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
@@ -178,18 +181,48 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
 	if (!msg) {
 		ceph_osdc_put_request(req);
 		return NULL;
 	}
 	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+	req->r_request = msg;
+	req->r_pages = pages;
+
+	return req;
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+			    u64 off, u64 *plen,
+			    int opcode,
+			    struct ceph_snap_context *snapc,
+			    int do_sync,
+			    u32 truncate_seq,
+			    u64 truncate_size,
+			    struct timespec *mtime,
+			    const char *oid,
+			    int oid_len)
+{
+	struct ceph_msg *msg = req->r_request;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = 1 + do_sync;
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int i;
+	int flags = req->r_flags;
+
 	head = msg->front.iov_base;
 	op = (void *)(head + 1);
 	p = (void *)(op + num_op);
 
-	req->r_request = msg;
 	req->r_snapc = ceph_get_snap_context(snapc);
 
 	head->client_inc = cpu_to_le32(1); /* always, for now. */
@@ -199,10 +232,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	head->num_ops = cpu_to_le16(num_op);
 	op->op = cpu_to_le16(opcode);
 
-	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req);
-	req->r_file_layout = *layout;  /* keep a copy */
-
 	if (flags & CEPH_OSD_FLAG_WRITE) {
 		req->r_request->hdr.data_off = cpu_to_le16(off);
 		req->r_request->hdr.data_len = cpu_to_le32(*plen);
@@ -212,9 +241,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
 
 	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
-	memcpy(p, req->r_oid, req->r_oid_len);
-	p += req->r_oid_len;
+	head->object_len = cpu_to_le32(oid_len);
+	memcpy(p, oid, oid_len);
+	p += oid_len;
 
 	if (do_sync) {
 		op++;
@@ -233,6 +262,50 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	msg_size = p - msg->front.iov_base;
 	msg->front.iov_len = msg_size;
 	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return;
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_request *req =
+		ceph_osdc_alloc_request(osdc, flags,
+					 snapc, do_sync,
+					 use_mempool,
+					 GFP_NOFS, NULL);
+	if (IS_ERR(req))
+		return req;
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	ceph_osdc_build_request(req, off, plen, opcode,
+				snapc, do_sync,
+				truncate_seq, truncate_size,
+				mtime,
+				req->r_oid, req->r_oid_len);
+
 	return req;
 }
 
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index ce776989ef6a..b687c2ea72e6 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -119,6 +119,31 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
+extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 len, u64 *bno,
+			struct ceph_osd_request *req);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req,
+			    u64 off, u64 *plen,
+			    int opcode,
+			    struct ceph_snap_context *snapc,
+			    int do_sync,
+			    u32 truncate_seq,
+			    u64 truncate_size,
+			    struct timespec *mtime,
+			    const char *oid,
+			    int oid_len);
+
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
 				      struct ceph_vino vino,
-- 
cgit v1.2.3


From 68b4476b0bc13fef18266b4140309a30e86739d2 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 6 Apr 2010 15:01:27 -0700
Subject: ceph: messenger and osdc changes for rbd

Allow the messenger to send/receive data in a bio.  This is added
so that we wouldn't need to copy the data into pages or some other buffer
when doing IO for an rbd block device.

We can now have trailing variable sized data for osd
ops.  Also osd ops encoding is more modular.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c  | 219 +++++++++++++++++++++++++++++++++++++-------
 fs/ceph/messenger.h  |   4 +
 fs/ceph/osd_client.c | 249 ++++++++++++++++++++++++++++++++++++++++-----------
 fs/ceph/osd_client.h |  61 ++++++++++---
 fs/ceph/pagelist.c   |   2 +-
 fs/ceph/pagelist.h   |   2 +-
 6 files changed, 436 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 2502d76fcec1..17a09b32a591 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -9,6 +9,8 @@
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <net/tcp.h>
 
 #include "super.h"
@@ -529,8 +531,11 @@ static void prepare_write_message(struct ceph_connection *con)
 	if (le32_to_cpu(m->hdr.data_len) > 0) {
 		/* initialize page iterator */
 		con->out_msg_pos.page = 0;
-		con->out_msg_pos.page_pos =
-			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		if (m->pages)
+			con->out_msg_pos.page_pos =
+				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		else
+			con->out_msg_pos.page_pos = 0;
 		con->out_msg_pos.data_pos = 0;
 		con->out_msg_pos.did_page_crc = 0;
 		con->out_more = 1;  /* data + footer will follow */
@@ -712,6 +717,31 @@ out:
 	return ret;  /* done! */
 }
 
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
 /*
  * Write as much message data payload as we can.  If we finish, queue
  * up the footer.
@@ -726,21 +756,46 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	size_t len;
 	int crc = con->msgr->nocrc;
 	int ret;
+	int total_max_write;
+	int in_trail = 0;
+	size_t trail_len = (msg->trail ? msg->trail->length : 0);
 
 	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
 	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
 	     con->out_msg_pos.page_pos);
 
-	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+	while (data_len > con->out_msg_pos.data_pos) {
 		struct page *page = NULL;
 		void *kaddr = NULL;
+		int max_write = PAGE_SIZE;
+		int page_shift = 0;
+
+		total_max_write = data_len - trail_len -
+			con->out_msg_pos.data_pos;
 
 		/*
 		 * if we are calculating the data crc (the default), we need
 		 * to map the page.  if our pages[] has been revoked, use the
 		 * zero page.
 		 */
-		if (msg->pages) {
+
+		/* have we reached the trail part of the data? */
+		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+			in_trail = 1;
+
+			total_max_write = data_len - con->out_msg_pos.data_pos;
+
+			page = list_first_entry(&msg->trail->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+			max_write = PAGE_SIZE;
+		} else if (msg->pages) {
 			page = msg->pages[con->out_msg_pos.page];
 			if (crc)
 				kaddr = kmap(page);
@@ -749,13 +804,25 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 						struct page, lru);
 			if (crc)
 				kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+		} else if (msg->bio) {
+			struct bio_vec *bv;
+
+			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+			page = bv->bv_page;
+			page_shift = bv->bv_offset;
+			if (crc)
+				kaddr = kmap(page) + page_shift;
+			max_write = bv->bv_len;
+#endif
 		} else {
 			page = con->msgr->zero_page;
 			if (crc)
 				kaddr = page_address(con->msgr->zero_page);
 		}
-		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-			  (int)(data_len - con->out_msg_pos.data_pos));
+		len = min_t(int, max_write - con->out_msg_pos.page_pos,
+			    total_max_write);
+
 		if (crc && !con->out_msg_pos.did_page_crc) {
 			void *base = kaddr + con->out_msg_pos.page_pos;
 			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
@@ -765,13 +832,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 				cpu_to_le32(crc32c(tmpcrc, base, len));
 			con->out_msg_pos.did_page_crc = 1;
 		}
-
 		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos, len,
+				      con->out_msg_pos.page_pos + page_shift,
+				      len,
 				      MSG_DONTWAIT | MSG_NOSIGNAL |
 				      MSG_MORE);
 
-		if (crc && (msg->pages || msg->pagelist))
+		if (crc &&
+		    (msg->pages || msg->pagelist || msg->bio || in_trail))
 			kunmap(page);
 
 		if (ret <= 0)
@@ -783,9 +851,16 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 			con->out_msg_pos.page_pos = 0;
 			con->out_msg_pos.page++;
 			con->out_msg_pos.did_page_crc = 0;
-			if (msg->pagelist)
+			if (in_trail)
+				list_move_tail(&page->lru,
+					       &msg->trail->head);
+			else if (msg->pagelist)
 				list_move_tail(&page->lru,
 					       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+			else if (msg->bio)
+				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
 		}
 	}
 
@@ -1305,8 +1380,7 @@ static int read_partial_message_section(struct ceph_connection *con,
 					struct kvec *section,
 					unsigned int sec_len, u32 *crc)
 {
-	int left;
-	int ret;
+	int ret, left;
 
 	BUG_ON(!section);
 
@@ -1329,13 +1403,83 @@ static int read_partial_message_section(struct ceph_connection *con,
 static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 				struct ceph_msg_header *hdr,
 				int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+				      struct page **pages,
+				      unsigned data_len, int datacrc)
+{
+	void *p;
+	int ret;
+	int left;
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+	/* (page) data */
+	BUG_ON(pages == NULL);
+	p = kmap(pages[con->in_msg_pos.page]);
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(pages[con->in_msg_pos.page]);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+		con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.page++;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+				    struct bio **bio_iter, int *bio_seg,
+				    unsigned data_len, int datacrc)
+{
+	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+	void *p;
+	int ret, left;
+
+	if (IS_ERR(bv))
+		return PTR_ERR(bv);
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+	p = kmap(bv->bv_page) + bv->bv_offset;
+
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(bv->bv_page);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == bv->bv_len) {
+		con->in_msg_pos.page_pos = 0;
+		iter_bio_next(bio_iter, bio_seg);
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * read (part of) a message.
  */
 static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
-	void *p;
 	int ret;
 	int to, left;
 	unsigned front_len, middle_len, data_len, data_off;
@@ -1422,7 +1566,10 @@ static int read_partial_message(struct ceph_connection *con)
 			m->middle->vec.iov_len = 0;
 
 		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		if (m->pages)
+			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		else
+			con->in_msg_pos.page_pos = 0;
 		con->in_msg_pos.data_pos = 0;
 	}
 
@@ -1440,27 +1587,29 @@ static int read_partial_message(struct ceph_connection *con)
 		if (ret <= 0)
 			return ret;
 	}
+#ifdef CONFIG_BLOCK
+	if (m->bio && !m->bio_iter)
+		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
 
 	/* (page) data */
 	while (con->in_msg_pos.data_pos < data_len) {
-		left = min((int)(data_len - con->in_msg_pos.data_pos),
-			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-		BUG_ON(m->pages == NULL);
-		p = kmap(m->pages[con->in_msg_pos.page]);
-		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-				       left);
-		if (ret > 0 && datacrc)
-			con->in_data_crc =
-				crc32c(con->in_data_crc,
-					  p + con->in_msg_pos.page_pos, ret);
-		kunmap(m->pages[con->in_msg_pos.page]);
-		if (ret <= 0)
-			return ret;
-		con->in_msg_pos.data_pos += ret;
-		con->in_msg_pos.page_pos += ret;
-		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-			con->in_msg_pos.page_pos = 0;
-			con->in_msg_pos.page++;
+		if (m->pages) {
+			ret = read_partial_message_pages(con, m->pages,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#ifdef CONFIG_BLOCK
+		} else if (m->bio) {
+
+			ret = read_partial_message_bio(con,
+						 &m->bio_iter, &m->bio_seg,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#endif
+		} else {
+			BUG_ON(1);
 		}
 	}
 
@@ -2136,6 +2285,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 	m->nr_pages = 0;
 	m->pages = NULL;
 	m->pagelist = NULL;
+	m->bio = NULL;
+	m->bio_iter = NULL;
+	m->bio_seg = 0;
+	m->trail = NULL;
 
 	dout("ceph_msg_new %p front %d\n", m, front_len);
 	return m;
@@ -2250,6 +2403,8 @@ void ceph_msg_last_put(struct kref *kref)
 		m->pagelist = NULL;
 	}
 
+	m->trail = NULL;
+
 	if (m->pool)
 		ceph_msgpool_put(m->pool, m);
 	else
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 76fbc957bc13..5a79450604ef 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -82,6 +82,10 @@ struct ceph_msg {
 	struct ceph_pagelist *pagelist; /* instead of pages */
 	struct list_head list_head;
 	struct kref kref;
+	struct bio  *bio;		/* instead of pages/pagelist */
+	struct bio  *bio_iter;		/* bio iterator */
+	int bio_seg;			/* current bio segment */
+	struct ceph_pagelist *trail;	/* the trailing part of the data */
 	bool front_is_vmalloc;
 	bool more_to_follow;
 	bool needs_out_seq;
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 2647dafd96f5..c5d818e73add 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -6,12 +6,16 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
 
 #include "super.h"
 #include "osd_client.h"
 #include "messenger.h"
 #include "decode.h"
 #include "auth.h"
+#include "pagelist.h"
 
 #define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
@@ -22,29 +26,50 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
 
+static int op_needs_trail(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_CALL:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int op_has_extent(int op)
+{
+	return (op == CEPH_OSD_OP_READ ||
+		op == CEPH_OSD_OP_WRITE);
+}
+
 void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
-			u64 off, u64 len, u64 *bno,
-			struct ceph_osd_request *req)
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_osd_op *op = (void *)(reqhead + 1);
-	u64 orig_len = len;
+	u64 orig_len = *plen;
 	u64 objoff, objlen;    /* extent in object */
 
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, &len, bno,
+	ceph_calc_file_object_mapping(layout, off, plen, bno,
 				      &objoff, &objlen);
-	if (len < orig_len)
+	if (*plen < orig_len)
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - len, off, len);
+		     orig_len - *plen, off, *plen);
 
-	op->extent.offset = cpu_to_le64(objoff);
-	op->extent.length = cpu_to_le64(objlen);
-	req->r_num_pages = calc_pages_for(off, len);
+	if (op_has_extent(op->op)) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
 
 	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
 	     *bno, objoff, objlen, req->r_num_pages);
@@ -80,11 +105,13 @@ static void calc_layout(struct ceph_osd_client *osdc,
 			struct ceph_vino vino,
 			struct ceph_file_layout *layout,
 			u64 off, u64 *plen,
-			struct ceph_osd_request *req)
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
 {
 	u64 bno;
 
-	ceph_calc_raw_layout(osdc, layout, vino.snap, off, *plen, &bno, req);
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+			     plen, &bno, req, op);
 
 	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);
@@ -113,35 +140,64 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
 					 req->r_num_pages);
+#ifdef CONFIG_BLOCK
+	if (req->r_bio)
+		bio_put(req->r_bio);
+#endif
 	ceph_put_snap_context(req->r_snapc);
+	if (req->r_trail) {
+		ceph_pagelist_release(req->r_trail);
+		kfree(req->r_trail);
+	}
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
 		kfree(req);
 }
 
+static int op_needs_trail(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+	int i = 0;
+
+	if (needs_trail)
+		*needs_trail = 0;
+	while (ops[i].op) {
+		if (needs_trail && op_needs_trail(ops[i].op))
+			*needs_trail = 1;
+		i++;
+	}
+
+	return i;
+}
+
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       int flags,
 					       struct ceph_snap_context *snapc,
-					       int do_sync,
+					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags,
-					       struct page **pages)
+					       struct page **pages,
+					       struct bio *bio)
 {
 	struct ceph_osd_request *req;
 	struct ceph_msg *msg;
-	int num_op = 1 + do_sync;
-	size_t msg_size = sizeof(struct ceph_osd_request_head) +
-			  num_op*sizeof(struct ceph_osd_op);
+	int needs_trail;
+	int num_op = get_num_ops(ops, &needs_trail);
+	size_t msg_size = sizeof(struct ceph_osd_request_head);
 
-	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, gfp_flags);
-		memset(req, 0, sizeof(*req));
-	} else {
-		req = kzalloc(sizeof(*req), gfp_flags);
-	}
-	if (!req)
-		return NULL;
+	msg_size += num_op*sizeof(struct ceph_osd_op);
 
 	if (use_mempool) {
 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
@@ -154,6 +210,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
+
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
@@ -174,6 +231,15 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	}
 	req->r_reply = msg;
 
+	/* allocate space for the trailing data */
+	if (needs_trail) {
+		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+		if (!req->r_trail) {
+			ceph_osdc_put_request(req);
+			return NULL;
+		}
+		ceph_pagelist_init(req->r_trail);
+	}
 	/* create request message; allow space for oid */
 	msg_size += 40;
 	if (snapc)
@@ -186,38 +252,87 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		ceph_osdc_put_request(req);
 		return NULL;
 	}
+
 	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
 
 	req->r_request = msg;
 	req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+	if (bio) {
+		req->r_bio = bio;
+		bio_get(req->r_bio);
+	}
+#endif
 
 	return req;
 }
 
+static void osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst,
+			      struct ceph_osd_req_op *src)
+{
+	dst->op = cpu_to_le16(src->op);
+
+	switch (dst->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		dst->extent.offset =
+			cpu_to_le64(src->extent.offset);
+		dst->extent.length =
+			cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		break;
+
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		BUG_ON(!req->r_trail);
+
+		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+		dst->xattr.cmp_op = src->xattr.cmp_op;
+		dst->xattr.cmp_mode = src->xattr.cmp_mode;
+		ceph_pagelist_append(req->r_trail, src->xattr.name,
+				     src->xattr.name_len);
+		ceph_pagelist_append(req->r_trail, src->xattr.val,
+				     src->xattr.value_len);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	default:
+		pr_err("unrecognized osd opcode %d\n", dst->op);
+		WARN_ON(1);
+		break;
+	}
+	dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
 /*
  * build new request AND message
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-			    u64 off, u64 *plen,
-			    int opcode,
-			    struct ceph_snap_context *snapc,
-			    int do_sync,
-			    u32 truncate_seq,
-			    u64 truncate_size,
-			    struct timespec *mtime,
-			    const char *oid,
-			    int oid_len)
+			     u64 off, u64 *plen,
+			     struct ceph_osd_req_op *src_ops,
+			     struct ceph_snap_context *snapc,
+			     struct timespec *mtime,
+			     const char *oid,
+			     int oid_len)
 {
 	struct ceph_msg *msg = req->r_request;
 	struct ceph_osd_request_head *head;
+	struct ceph_osd_req_op *src_op;
 	struct ceph_osd_op *op;
 	void *p;
-	int num_op = 1 + do_sync;
+	int num_op = get_num_ops(src_ops, NULL);
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int i;
 	int flags = req->r_flags;
+	u64 data_len = 0;
+	int i;
 
 	head = msg->front.iov_base;
 	op = (void *)(head + 1);
@@ -230,25 +345,23 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 	if (flags & CEPH_OSD_FLAG_WRITE)
 		ceph_encode_timespec(&head->mtime, mtime);
 	head->num_ops = cpu_to_le16(num_op);
-	op->op = cpu_to_le16(opcode);
 
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen);
-		op->payload_len = cpu_to_le32(*plen);
-	}
-	op->extent.truncate_size = cpu_to_le64(truncate_size);
-	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
 
 	/* fill in oid */
 	head->object_len = cpu_to_le32(oid_len);
 	memcpy(p, oid, oid_len);
 	p += oid_len;
 
-	if (do_sync) {
+	src_op = src_ops;
+	while (src_op->op) {
+		osd_req_encode_op(req, op, src_op);
+		src_op++;
 		op++;
-		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
 	}
+
+	if (req->r_trail)
+		data_len += req->r_trail->length;
+
 	if (snapc) {
 		head->snap_seq = cpu_to_le64(snapc->seq);
 		head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -258,6 +371,14 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
 		}
 	}
 
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+	} else if (data_len) {
+		req->r_request->hdr.data_off = 0;
+		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+	}
+
 	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
 	msg_size = p - msg->front.iov_base;
 	msg->front.iov_len = msg_size;
@@ -288,21 +409,34 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       struct timespec *mtime,
 					       bool use_mempool, int num_reply)
 {
-	struct ceph_osd_request *req =
-		ceph_osdc_alloc_request(osdc, flags,
-					 snapc, do_sync,
+	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_request *req;
+
+	ops[0].op = opcode;
+	ops[0].extent.truncate_seq = truncate_seq;
+	ops[0].extent.truncate_size = truncate_size;
+	ops[0].payload_len = 0;
+
+	if (do_sync) {
+		ops[1].op = CEPH_OSD_OP_STARTSYNC;
+		ops[1].payload_len = 0;
+		ops[2].op = 0;
+	} else
+		ops[1].op = 0;
+
+	req = ceph_osdc_alloc_request(osdc, flags,
+					 snapc, ops,
 					 use_mempool,
-					 GFP_NOFS, NULL);
+					 GFP_NOFS, NULL, NULL);
 	if (IS_ERR(req))
 		return req;
 
 	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req);
+	calc_layout(osdc, vino, layout, off, plen, req, ops);
 	req->r_file_layout = *layout;  /* keep a copy */
 
-	ceph_osdc_build_request(req, off, plen, opcode,
-				snapc, do_sync,
-				truncate_seq, truncate_size,
+	ceph_osdc_build_request(req, off, plen, ops,
+				snapc,
 				mtime,
 				req->r_oid, req->r_oid_len);
 
@@ -1177,6 +1311,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 
 	req->r_request->pages = req->r_pages;
 	req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+	req->r_request->bio = req->r_bio;
+#endif
+	req->r_request->trail = req->r_trail;
 
 	register_request(osdc, req);
 
@@ -1493,6 +1631,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		}
 		m->pages = req->r_pages;
 		m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+		m->bio = req->r_bio;
+#endif
 	}
 	*skip = 0;
 	req->r_con_filling_msg = ceph_con_get(con);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index b687c2ea72e6..d583d1bf6cd9 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -15,6 +15,7 @@ struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
 struct ceph_authorizer;
+struct ceph_pagelist;
 
 /*
  * completion callback for async writepages
@@ -80,6 +81,11 @@ struct ceph_osd_request {
 	struct page     **r_pages;            /* pages for data payload */
 	int               r_pages_from_pool;
 	int               r_own_pages;        /* if true, i own page list */
+#ifdef CONFIG_BLOCK
+	struct bio       *r_bio;	      /* instead of pages */
+#endif
+
+	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
 };
 
 struct ceph_osd_client {
@@ -110,6 +116,36 @@ struct ceph_osd_client {
 	struct ceph_msgpool	msgpool_op_reply;
 };
 
+struct ceph_osd_req_op {
+	u16 op;           /* CEPH_OSD_OP_* */
+	u32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			u64 offset, length;
+			u64 truncate_size;
+			u32 truncate_seq;
+		} extent;
+		struct {
+			const char *name;
+			u32 name_len;
+			const char  *val;
+			u32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} xattr;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			u32 indata_len;
+		} cls;
+		struct {
+			u64 cookie, count;
+		} pgls;
+	};
+	u32 payload_len;
+};
+
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
 			  struct ceph_client *client);
 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
@@ -122,27 +158,26 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
-			u64 off, u64 len, u64 *bno,
-			struct ceph_osd_request *req);
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op);
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       int flags,
 					       struct ceph_snap_context *snapc,
-					       int do_sync,
+					       struct ceph_osd_req_op *ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags,
-					       struct page **pages);
+					       struct page **pages,
+					       struct bio *bio);
 
 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
-			    u64 off, u64 *plen,
-			    int opcode,
-			    struct ceph_snap_context *snapc,
-			    int do_sync,
-			    u32 truncate_seq,
-			    u64 truncate_size,
-			    struct timespec *mtime,
-			    const char *oid,
-			    int oid_len);
+				    u64 off, u64 *plen,
+				    struct ceph_osd_req_op *src_ops,
+				    struct ceph_snap_context *snapc,
+				    struct timespec *mtime,
+				    const char *oid,
+				    int oid_len);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      struct ceph_file_layout *layout,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 46a368b6dce5..326e1c04176f 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -39,7 +39,7 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 	return 0;
 }
 
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
 {
 	while (pl->room < len) {
 		size_t bit = pl->room;
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
index e8a4187e1087..cc9327aa1c98 100644
--- a/fs/ceph/pagelist.h
+++ b/fs/ceph/pagelist.h
@@ -19,7 +19,7 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
 }
 extern int ceph_pagelist_release(struct ceph_pagelist *pl);
 
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
 
 static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
 {
-- 
cgit v1.2.3


From ae1533b62b3369e6ae32338f4a77d64d0e88f676 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 18 May 2010 16:38:08 -0700
Subject: ceph-rbd: osdc support for osd call and rollback operations

This will be used for rbd snapshots administration.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/decode.h     |  5 +++++
 fs/ceph/osd_client.c | 18 ++++++++++++++++++
 fs/ceph/osd_client.h |  6 ++++++
 3 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 3d25415afe63..c5b6939fb32a 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end,
 		ceph_encode_need(p, end, n, bad);		\
 		ceph_encode_copy(p, pv, n);			\
 	} while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_string(p, end, s, n);		\
+	} while (0)
 
 
 #endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c5d818e73add..0edb43f1ef26 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -161,6 +161,7 @@ static int op_needs_trail(int op)
 	case CEPH_OSD_OP_GETXATTR:
 	case CEPH_OSD_OP_SETXATTR:
 	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_CALL:
 		return 1;
 	default:
 		return 0;
@@ -301,6 +302,23 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 		ceph_pagelist_append(req->r_trail, src->xattr.val,
 				     src->xattr.value_len);
 		break;
+	case CEPH_OSD_OP_CALL:
+		BUG_ON(!req->r_trail);
+
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+				     src->cls.class_len);
+		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+				     src->cls.method_len);
+		ceph_pagelist_append(req->r_trail, src->cls.indata,
+				     src->cls.indata_len);
+		break;
+	case CEPH_OSD_OP_ROLLBACK:
+		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
 	default:
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index d583d1bf6cd9..65c9c560f1ac 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -134,14 +134,20 @@ struct ceph_osd_req_op {
 			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
 		} xattr;
 		struct {
+			const char *class_name;
 			__u8 class_len;
+			const char *method_name;
 			__u8 method_len;
 			__u8 argc;
+			const char *indata;
 			u32 indata_len;
 		} cls;
 		struct {
 			u64 cookie, count;
 		} pgls;
+	        struct {
+		        u64 snapid;
+	        } snap;
 	};
 	u32 payload_len;
 };
-- 
cgit v1.2.3


From 3d14c5d2b6e15c21d8e5467dc62d33127c23a644 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 6 Apr 2010 15:14:15 -0700
Subject: ceph: factor out libceph from Ceph file system

This factors out protocol and low-level storage parts of ceph into a
separate libceph module living in net/ceph and include/linux/ceph.  This
is mostly a matter of moving files around.  However, a few key pieces
of the interface change as well:

 - ceph_client becomes ceph_fs_client and ceph_client, where the latter
   captures the mon and osd clients, and the fs_client gets the mds client
   and file system specific pieces.
 - Mount option parsing and debugfs setup is correspondingly broken into
   two pieces.
 - The mon client gets a generic handler callback for otherwise unknown
   messages (mds map, in this case).
 - The basic supported/required feature bits can be expanded (and are by
   ceph_fs_client).

No functional change, aside from some subtle error handling cases that got
cleaned up in the refactoring process.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 MAINTAINERS                     |    2 +
 fs/ceph/Kconfig                 |   14 +-
 fs/ceph/Makefile                |   11 +-
 fs/ceph/README                  |   20 -
 fs/ceph/addr.c                  |   65 +-
 fs/ceph/armor.c                 |  103 --
 fs/ceph/auth.c                  |  259 -----
 fs/ceph/auth.h                  |   92 --
 fs/ceph/auth_none.c             |  131 ---
 fs/ceph/auth_none.h             |   30 -
 fs/ceph/auth_x.c                |  687 -----------
 fs/ceph/auth_x.h                |   49 -
 fs/ceph/auth_x_protocol.h       |   90 --
 fs/ceph/buffer.c                |   65 --
 fs/ceph/buffer.h                |   39 -
 fs/ceph/caps.c                  |   35 +-
 fs/ceph/ceph_debug.h            |   37 -
 fs/ceph/ceph_frag.c             |    3 +-
 fs/ceph/ceph_frag.h             |  109 --
 fs/ceph/ceph_fs.c               |   72 --
 fs/ceph/ceph_fs.h               |  728 ------------
 fs/ceph/ceph_hash.c             |  118 --
 fs/ceph/ceph_hash.h             |   13 -
 fs/ceph/ceph_strings.c          |  193 ---
 fs/ceph/crush/crush.c           |  151 ---
 fs/ceph/crush/crush.h           |  180 ---
 fs/ceph/crush/hash.c            |  149 ---
 fs/ceph/crush/hash.h            |   17 -
 fs/ceph/crush/mapper.c          |  609 ----------
 fs/ceph/crush/mapper.h          |   20 -
 fs/ceph/crypto.c                |  412 -------
 fs/ceph/crypto.h                |   48 -
 fs/ceph/debugfs.c               |  407 ++-----
 fs/ceph/decode.h                |  201 ----
 fs/ceph/dir.c                   |   55 +-
 fs/ceph/export.c                |    5 +-
 fs/ceph/file.c                  |  207 +---
 fs/ceph/inode.c                 |   19 +-
 fs/ceph/ioctl.c                 |   11 +-
 fs/ceph/locks.c                 |    6 +-
 fs/ceph/mds_client.c            |   86 +-
 fs/ceph/mds_client.h            |   20 +-
 fs/ceph/mdsmap.c                |   11 +-
 fs/ceph/mdsmap.h                |   62 -
 fs/ceph/messenger.c             | 2432 --------------------------------------
 fs/ceph/messenger.h             |  257 ----
 fs/ceph/mon_client.c            | 1018 ----------------
 fs/ceph/mon_client.h            |  121 --
 fs/ceph/msgpool.c               |   64 -
 fs/ceph/msgpool.h               |   25 -
 fs/ceph/msgr.h                  |  175 ---
 fs/ceph/osd_client.c            | 1771 ----------------------------
 fs/ceph/osd_client.h            |  233 ----
 fs/ceph/osdmap.c                | 1123 ------------------
 fs/ceph/osdmap.h                |  130 ---
 fs/ceph/pagelist.c              |   63 -
 fs/ceph/pagelist.h              |   54 -
 fs/ceph/rados.h                 |  405 -------
 fs/ceph/snap.c                  |   10 +-
 fs/ceph/strings.c               |  117 ++
 fs/ceph/super.c                 | 1154 ++++++++----------
 fs/ceph/super.h                 |  397 +++----
 fs/ceph/types.h                 |   29 -
 fs/ceph/xattr.c                 |   15 +-
 include/linux/ceph/auth.h       |   92 ++
 include/linux/ceph/buffer.h     |   39 +
 include/linux/ceph/ceph_debug.h |   38 +
 include/linux/ceph/ceph_frag.h  |  109 ++
 include/linux/ceph/ceph_fs.h    |  728 ++++++++++++
 include/linux/ceph/ceph_hash.h  |   13 +
 include/linux/ceph/debugfs.h    |   33 +
 include/linux/ceph/decode.h     |  201 ++++
 include/linux/ceph/libceph.h    |  249 ++++
 include/linux/ceph/mdsmap.h     |   62 +
 include/linux/ceph/messenger.h  |  261 +++++
 include/linux/ceph/mon_client.h |  122 ++
 include/linux/ceph/msgpool.h    |   25 +
 include/linux/ceph/msgr.h       |  175 +++
 include/linux/ceph/osd_client.h |  234 ++++
 include/linux/ceph/osdmap.h     |  130 +++
 include/linux/ceph/pagelist.h   |   54 +
 include/linux/ceph/rados.h      |  405 +++++++
 include/linux/ceph/types.h      |   29 +
 include/linux/crush/crush.h     |  180 +++
 include/linux/crush/hash.h      |   17 +
 include/linux/crush/mapper.h    |   20 +
 net/Kconfig                     |    1 +
 net/Makefile                    |    1 +
 net/ceph/Kconfig                |   28 +
 net/ceph/Makefile               |   37 +
 net/ceph/armor.c                |  103 ++
 net/ceph/auth.c                 |  259 +++++
 net/ceph/auth_none.c            |  132 +++
 net/ceph/auth_none.h            |   29 +
 net/ceph/auth_x.c               |  688 +++++++++++
 net/ceph/auth_x.h               |   50 +
 net/ceph/auth_x_protocol.h      |   90 ++
 net/ceph/buffer.c               |   68 ++
 net/ceph/ceph_common.c          |  529 +++++++++
 net/ceph/ceph_fs.c              |   75 ++
 net/ceph/ceph_hash.c            |  118 ++
 net/ceph/ceph_strings.c         |   84 ++
 net/ceph/crush/crush.c          |  151 +++
 net/ceph/crush/hash.c           |  149 +++
 net/ceph/crush/mapper.c         |  609 ++++++++++
 net/ceph/crypto.c               |  412 +++++++
 net/ceph/crypto.h               |   48 +
 net/ceph/debugfs.c              |  268 +++++
 net/ceph/messenger.c            | 2453 +++++++++++++++++++++++++++++++++++++++
 net/ceph/mon_client.c           | 1027 ++++++++++++++++
 net/ceph/msgpool.c              |   64 +
 net/ceph/osd_client.c           | 1773 ++++++++++++++++++++++++++++
 net/ceph/osdmap.c               | 1128 ++++++++++++++++++
 net/ceph/pagelist.c             |   65 ++
 net/ceph/pagevec.c              |  223 ++++
 115 files changed, 14920 insertions(+), 14192 deletions(-)
 delete mode 100644 fs/ceph/README
 delete mode 100644 fs/ceph/armor.c
 delete mode 100644 fs/ceph/auth.c
 delete mode 100644 fs/ceph/auth.h
 delete mode 100644 fs/ceph/auth_none.c
 delete mode 100644 fs/ceph/auth_none.h
 delete mode 100644 fs/ceph/auth_x.c
 delete mode 100644 fs/ceph/auth_x.h
 delete mode 100644 fs/ceph/auth_x_protocol.h
 delete mode 100644 fs/ceph/buffer.c
 delete mode 100644 fs/ceph/buffer.h
 delete mode 100644 fs/ceph/ceph_debug.h
 delete mode 100644 fs/ceph/ceph_frag.h
 delete mode 100644 fs/ceph/ceph_fs.c
 delete mode 100644 fs/ceph/ceph_fs.h
 delete mode 100644 fs/ceph/ceph_hash.c
 delete mode 100644 fs/ceph/ceph_hash.h
 delete mode 100644 fs/ceph/ceph_strings.c
 delete mode 100644 fs/ceph/crush/crush.c
 delete mode 100644 fs/ceph/crush/crush.h
 delete mode 100644 fs/ceph/crush/hash.c
 delete mode 100644 fs/ceph/crush/hash.h
 delete mode 100644 fs/ceph/crush/mapper.c
 delete mode 100644 fs/ceph/crush/mapper.h
 delete mode 100644 fs/ceph/crypto.c
 delete mode 100644 fs/ceph/crypto.h
 delete mode 100644 fs/ceph/decode.h
 delete mode 100644 fs/ceph/mdsmap.h
 delete mode 100644 fs/ceph/messenger.c
 delete mode 100644 fs/ceph/messenger.h
 delete mode 100644 fs/ceph/mon_client.c
 delete mode 100644 fs/ceph/mon_client.h
 delete mode 100644 fs/ceph/msgpool.c
 delete mode 100644 fs/ceph/msgpool.h
 delete mode 100644 fs/ceph/msgr.h
 delete mode 100644 fs/ceph/osd_client.c
 delete mode 100644 fs/ceph/osd_client.h
 delete mode 100644 fs/ceph/osdmap.c
 delete mode 100644 fs/ceph/osdmap.h
 delete mode 100644 fs/ceph/pagelist.c
 delete mode 100644 fs/ceph/pagelist.h
 delete mode 100644 fs/ceph/rados.h
 create mode 100644 fs/ceph/strings.c
 delete mode 100644 fs/ceph/types.h
 create mode 100644 include/linux/ceph/auth.h
 create mode 100644 include/linux/ceph/buffer.h
 create mode 100644 include/linux/ceph/ceph_debug.h
 create mode 100644 include/linux/ceph/ceph_frag.h
 create mode 100644 include/linux/ceph/ceph_fs.h
 create mode 100644 include/linux/ceph/ceph_hash.h
 create mode 100644 include/linux/ceph/debugfs.h
 create mode 100644 include/linux/ceph/decode.h
 create mode 100644 include/linux/ceph/libceph.h
 create mode 100644 include/linux/ceph/mdsmap.h
 create mode 100644 include/linux/ceph/messenger.h
 create mode 100644 include/linux/ceph/mon_client.h
 create mode 100644 include/linux/ceph/msgpool.h
 create mode 100644 include/linux/ceph/msgr.h
 create mode 100644 include/linux/ceph/osd_client.h
 create mode 100644 include/linux/ceph/osdmap.h
 create mode 100644 include/linux/ceph/pagelist.h
 create mode 100644 include/linux/ceph/rados.h
 create mode 100644 include/linux/ceph/types.h
 create mode 100644 include/linux/crush/crush.h
 create mode 100644 include/linux/crush/hash.h
 create mode 100644 include/linux/crush/mapper.h
 create mode 100644 net/ceph/Kconfig
 create mode 100644 net/ceph/Makefile
 create mode 100644 net/ceph/armor.c
 create mode 100644 net/ceph/auth.c
 create mode 100644 net/ceph/auth_none.c
 create mode 100644 net/ceph/auth_none.h
 create mode 100644 net/ceph/auth_x.c
 create mode 100644 net/ceph/auth_x.h
 create mode 100644 net/ceph/auth_x_protocol.h
 create mode 100644 net/ceph/buffer.c
 create mode 100644 net/ceph/ceph_common.c
 create mode 100644 net/ceph/ceph_fs.c
 create mode 100644 net/ceph/ceph_hash.c
 create mode 100644 net/ceph/ceph_strings.c
 create mode 100644 net/ceph/crush/crush.c
 create mode 100644 net/ceph/crush/hash.c
 create mode 100644 net/ceph/crush/mapper.c
 create mode 100644 net/ceph/crypto.c
 create mode 100644 net/ceph/crypto.h
 create mode 100644 net/ceph/debugfs.c
 create mode 100644 net/ceph/messenger.c
 create mode 100644 net/ceph/mon_client.c
 create mode 100644 net/ceph/msgpool.c
 create mode 100644 net/ceph/osd_client.c
 create mode 100644 net/ceph/osdmap.c
 create mode 100644 net/ceph/pagelist.c
 create mode 100644 net/ceph/pagevec.c

(limited to 'fs')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7679bf32f7bb..48d05654d456 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
 F:	Documentation/filesystems/ceph.txt
 F:	fs/ceph
+F:	net/ceph
+F:	include/linux/ceph
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M:	David Vrabel <david.vrabel@csr.com>
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
 config CEPH_FS
         tristate "Ceph distributed file system (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
+	select CEPH_LIB
 	select LIBCRC32C
 	select CRYPTO_AES
 	select CRYPTO
+	default n
 	help
 	  Choose Y or M here to include support for mounting the
 	  experimental Ceph distributed file system.  Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
 
 	  If unsure, say N.
 
-config CEPH_FS_PRETTYDEBUG
-	bool "Include file:line in ceph debug output"
-	depends on CEPH_FS
-	default n
-	help
-	  If you say Y here, debug output will include a filename and
-	  line to aid debugging.  This icnreases kernel size and slows
-	  execution slightly when debug call sites are enabled (e.g.,
-	  via CONFIG_DYNAMIC_DEBUG).
-
-	  If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
 	export.o caps.o snap.o xattr.o \
-	messenger.o msgpool.o buffer.o pagelist.o \
-	mds_client.o mdsmap.o \
-	mon_client.o \
-	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-	debugfs.o \
-	auth.o auth_none.o \
-	crypto.o armor.o \
-	auth_x.o \
-	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+	mds_client.o mdsmap.o strings.o ceph_frag.o \
+	debugfs.o
 
 else
 #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h	    fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc	    fs/ceph/ceph_fs.c
-src/include/msgr.h	    fs/ceph/msgr.h
-src/include/rados.h	    fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h	    fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c	    fs/ceph/crush/crush.c
-src/crush/crush.h	    fs/ceph/crush/crush.h
-src/crush/mapper.c	    fs/ceph/crush/mapper.c
-src/crush/mapper.h	    fs/ceph/crush/mapper.h
-src/crush/hash.h	    fs/ceph/crush/hash.h
-src/crush/hash.c	    fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
@@ -10,7 +10,8 @@
 #include <linux/task_io_accounting_ops.h>
 
 #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
 
 /*
  * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc = 
+		&ceph_inode_to_client(inode)->client->osdc;
 	int err = 0;
 	u64 len = PAGE_CACHE_SIZE;
 
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
 	int rc = 0;
 	struct page **pages;
 	loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	client = ceph_inode_to_client(inode);
-	osdc = &client->osdc;
+	fsc = ceph_inode_to_client(inode);
+	osdc = &fsc->client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
 	     inode, page, page->index, page_off, len, snapc);
 
-	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
 	if (writeback_stat >
-	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
-		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
 
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct address_space *mapping = inode->i_mapping;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
 		WARN_ON(!PageUptodate(page));
 
 		writeback_stat =
-			atomic_long_dec_return(&client->writeback_count);
+			atomic_long_dec_return(&fsc->writeback_count);
 		if (writeback_stat <
-		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
-			clear_bdi_congested(&client->backing_dev_info,
+		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
 		ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
  * mempool.  we avoid the mempool if we can because req->r_num_pages
  * may be less than the maximum write size.
  */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
 			   struct ceph_osd_request *req)
 {
 	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
 			       GFP_NOFS);
 	if (!req->r_pages) {
-		req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
 		req->r_pages_from_pool = 1;
 		WARN_ON(!req->r_pages);
 	}
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	client = ceph_inode_to_client(inode);
-	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+	fsc = ceph_inode_to_client(inode);
+	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
-	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
-		wsize = client->mount_args->wsize;
+	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+		wsize = fsc->mount_options->wsize;
 	if (wsize < PAGE_CACHE_SIZE)
 		wsize = PAGE_CACHE_SIZE;
 	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
 				offset = (unsigned long long)page->index
 					<< PAGE_CACHE_SHIFT;
 				len = wsize;
-				req = ceph_osdc_new_request(&client->osdc,
+				req = ceph_osdc_new_request(&fsc->client->osdc,
 					    &ci->i_layout,
 					    ceph_vino(inode),
 					    offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
 					    &inode->i_mtime, true, 1);
 				max_pages = req->r_num_pages;
 
-				alloc_page_vec(client, req);
+				alloc_page_vec(fsc, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
 			}
@@ -794,10 +797,10 @@ get_more_pages:
 			     inode, page, page->index);
 
 			writeback_stat =
-			       atomic_long_inc_return(&client->writeback_count);
+			       atomic_long_inc_return(&fsc->writeback_count);
 			if (writeback_stat > CONGESTION_ON_THRESH(
-				    client->mount_args->congestion_kb)) {
-				set_bdi_congested(&client->backing_dev_info,
+				    fsc->mount_options->congestion_kb)) {
+				set_bdi_congested(&fsc->backing_dev_info,
 						  BLK_RW_ASYNC);
 			}
 
@@ -846,7 +849,7 @@ get_more_pages:
 		op->payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
-		ceph_osdc_start_request(&client->osdc, req, true);
+		ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		req = NULL;
 
 		/* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page = vmf->page;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t off = page->index << PAGE_CACHE_SHIFT;
 	loff_t size, len;
 	int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
-
-#include <linux/errno.h>
-
-int ceph_armor(char *dst, const char *src, const char *end);
-int ceph_unarmor(char *dst, const char *src, const char *end);
-
-/*
- * base64 encode/decode.
- */
-
-static const char *pem_key =
-	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-static int encode_bits(int c)
-{
-	return pem_key[c];
-}
-
-static int decode_bits(char c)
-{
-	if (c >= 'A' && c <= 'Z')
-		return c - 'A';
-	if (c >= 'a' && c <= 'z')
-		return c - 'a' + 26;
-	if (c >= '0' && c <= '9')
-		return c - '0' + 52;
-	if (c == '+')
-		return 62;
-	if (c == '/')
-		return 63;
-	if (c == '=')
-		return 0; /* just non-negative, please */
-	return -EINVAL;
-}
-
-int ceph_armor(char *dst, const char *src, const char *end)
-{
-	int olen = 0;
-	int line = 0;
-
-	while (src < end) {
-		unsigned char a, b, c;
-
-		a = *src++;
-		*dst++ = encode_bits(a >> 2);
-		if (src < end) {
-			b = *src++;
-			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
-			if (src < end) {
-				c = *src++;
-				*dst++ = encode_bits(((b & 15) << 2) |
-						     (c >> 6));
-				*dst++ = encode_bits(c & 63);
-			} else {
-				*dst++ = encode_bits((b & 15) << 2);
-				*dst++ = '=';
-			}
-		} else {
-			*dst++ = encode_bits(((a & 3) << 4));
-			*dst++ = '=';
-			*dst++ = '=';
-		}
-		olen += 4;
-		line += 4;
-		if (line == 64) {
-			line = 0;
-			*(dst++) = '\n';
-			olen++;
-		}
-	}
-	return olen;
-}
-
-int ceph_unarmor(char *dst, const char *src, const char *end)
-{
-	int olen = 0;
-
-	while (src < end) {
-		int a, b, c, d;
-
-		if (src < end && src[0] == '\n')
-			src++;
-		if (src + 4 > end)
-			return -EINVAL;
-		a = decode_bits(src[0]);
-		b = decode_bits(src[1]);
-		c = decode_bits(src[2]);
-		d = decode_bits(src[3]);
-		if (a < 0 || b < 0 || c < 0 || d < 0)
-			return -EINVAL;
-
-		*dst++ = (a << 2) | (b >> 4);
-		if (src[2] == '=')
-			return olen + 1;
-		*dst++ = ((b & 15) << 4) | (c >> 2);
-		if (src[3] == '=')
-			return olen + 2;
-		*dst++ = ((c & 3) << 6) | d;
-		olen += 3;
-		src += 4;
-	}
-	return olen;
-}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include "types.h"
-#include "auth_none.h"
-#include "auth_x.h"
-#include "decode.h"
-#include "super.h"
-
-#include "messenger.h"
-
-/*
- * get protocol handler
- */
-static u32 supported_protocols[] = {
-	CEPH_AUTH_NONE,
-	CEPH_AUTH_CEPHX
-};
-
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
-{
-	switch (protocol) {
-	case CEPH_AUTH_NONE:
-		return ceph_auth_none_init(ac);
-	case CEPH_AUTH_CEPHX:
-		return ceph_x_init(ac);
-	default:
-		return -ENOENT;
-	}
-}
-
-/*
- * setup, teardown.
- */
-struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
-{
-	struct ceph_auth_client *ac;
-	int ret;
-
-	dout("auth_init name '%s' secret '%s'\n", name, secret);
-
-	ret = -ENOMEM;
-	ac = kzalloc(sizeof(*ac), GFP_NOFS);
-	if (!ac)
-		goto out;
-
-	ac->negotiating = true;
-	if (name)
-		ac->name = name;
-	else
-		ac->name = CEPH_AUTH_NAME_DEFAULT;
-	dout("auth_init name %s secret %s\n", ac->name, secret);
-	ac->secret = secret;
-	return ac;
-
-out:
-	return ERR_PTR(ret);
-}
-
-void ceph_auth_destroy(struct ceph_auth_client *ac)
-{
-	dout("auth_destroy %p\n", ac);
-	if (ac->ops)
-		ac->ops->destroy(ac);
-	kfree(ac);
-}
-
-/*
- * Reset occurs when reconnecting to the monitor.
- */
-void ceph_auth_reset(struct ceph_auth_client *ac)
-{
-	dout("auth_reset %p\n", ac);
-	if (ac->ops && !ac->negotiating)
-		ac->ops->reset(ac);
-	ac->negotiating = true;
-}
-
-int ceph_entity_name_encode(const char *name, void **p, void *end)
-{
-	int len = strlen(name);
-
-	if (*p + 2*sizeof(u32) + len > end)
-		return -ERANGE;
-	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
-	ceph_encode_32(p, len);
-	ceph_encode_copy(p, name, len);
-	return 0;
-}
-
-/*
- * Initiate protocol negotiation with monitor.  Include entity name
- * and list supported protocols.
- */
-int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
-{
-	struct ceph_mon_request_header *monhdr = buf;
-	void *p = monhdr + 1, *end = buf + len, *lenp;
-	int i, num;
-	int ret;
-
-	dout("auth_build_hello\n");
-	monhdr->have_version = 0;
-	monhdr->session_mon = cpu_to_le16(-1);
-	monhdr->session_mon_tid = 0;
-
-	ceph_encode_32(&p, 0);  /* no protocol, yet */
-
-	lenp = p;
-	p += sizeof(u32);
-
-	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	ceph_encode_8(&p, 1);
-	num = ARRAY_SIZE(supported_protocols);
-	ceph_encode_32(&p, num);
-	ceph_decode_need(&p, end, num * sizeof(u32), bad);
-	for (i = 0; i < num; i++)
-		ceph_encode_32(&p, supported_protocols[i]);
-
-	ret = ceph_entity_name_encode(ac->name, &p, end);
-	if (ret < 0)
-		return ret;
-	ceph_decode_need(&p, end, sizeof(u64), bad);
-	ceph_encode_64(&p, ac->global_id);
-
-	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
-	return p - buf;
-
-bad:
-	return -ERANGE;
-}
-
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
-				   void *msg_buf, size_t msg_len)
-{
-	struct ceph_mon_request_header *monhdr = msg_buf;
-	void *p = monhdr + 1;
-	void *end = msg_buf + msg_len;
-	int ret;
-
-	monhdr->have_version = 0;
-	monhdr->session_mon = cpu_to_le16(-1);
-	monhdr->session_mon_tid = 0;
-
-	ceph_encode_32(&p, ac->protocol);
-
-	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
-	if (ret < 0) {
-		pr_err("error %d building auth method %s request\n", ret,
-		       ac->ops->name);
-		return ret;
-	}
-	dout(" built request %d bytes\n", ret);
-	ceph_encode_32(&p, ret);
-	return p + ret - msg_buf;
-}
-
-/*
- * Handle auth message from monitor.
- */
-int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-			   void *buf, size_t len,
-			   void *reply_buf, size_t reply_len)
-{
-	void *p = buf;
-	void *end = buf + len;
-	int protocol;
-	s32 result;
-	u64 global_id;
-	void *payload, *payload_end;
-	int payload_len;
-	char *result_msg;
-	int result_msg_len;
-	int ret = -EINVAL;
-
-	dout("handle_auth_reply %p %p\n", p, end);
-	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
-	protocol = ceph_decode_32(&p);
-	result = ceph_decode_32(&p);
-	global_id = ceph_decode_64(&p);
-	payload_len = ceph_decode_32(&p);
-	payload = p;
-	p += payload_len;
-	ceph_decode_need(&p, end, sizeof(u32), bad);
-	result_msg_len = ceph_decode_32(&p);
-	result_msg = p;
-	p += result_msg_len;
-	if (p != end)
-		goto bad;
-
-	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
-	     result_msg, global_id, payload_len);
-
-	payload_end = payload + payload_len;
-
-	if (global_id && ac->global_id != global_id) {
-		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-		ac->global_id = global_id;
-	}
-
-	if (ac->negotiating) {
-		/* server does not support our protocols? */
-		if (!protocol && result < 0) {
-			ret = result;
-			goto out;
-		}
-		/* set up (new) protocol handler? */
-		if (ac->protocol && ac->protocol != protocol) {
-			ac->ops->destroy(ac);
-			ac->protocol = 0;
-			ac->ops = NULL;
-		}
-		if (ac->protocol != protocol) {
-			ret = ceph_auth_init_protocol(ac, protocol);
-			if (ret) {
-				pr_err("error %d on auth protocol %d init\n",
-				       ret, protocol);
-				goto out;
-			}
-		}
-
-		ac->negotiating = false;
-	}
-
-	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
-	if (ret == -EAGAIN) {
-		return ceph_build_auth_request(ac, reply_buf, reply_len);
-	} else if (ret) {
-		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-		return ret;
-	}
-	return 0;
-
-bad:
-	pr_err("failed to decode auth msg\n");
-out:
-	return ret;
-}
-
-int ceph_build_auth(struct ceph_auth_client *ac,
-		    void *msg_buf, size_t msg_len)
-{
-	if (!ac->protocol)
-		return ceph_auth_build_hello(ac, msg_buf, msg_len);
-	BUG_ON(!ac->ops);
-	if (ac->ops->should_authenticate(ac))
-		return ceph_build_auth_request(ac, msg_buf, msg_len);
-	return 0;
-}
-
-int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
-{
-	if (!ac->ops)
-		return 0;
-	return ac->ops->is_authenticated(ac);
-}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _FS_CEPH_AUTH_H
-#define _FS_CEPH_AUTH_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * Abstract interface for communicating with the authenticate module.
- * There is some handshake that takes place between us and the monitor
- * to acquire the necessary keys.  These are used to generate an
- * 'authorizer' that we use when connecting to a service (mds, osd).
- */
-
-struct ceph_auth_client;
-struct ceph_authorizer;
-
-struct ceph_auth_client_ops {
-	const char *name;
-
-	/*
-	 * true if we are authenticated and can connect to
-	 * services.
-	 */
-	int (*is_authenticated)(struct ceph_auth_client *ac);
-
-	/*
-	 * true if we should (re)authenticate, e.g., when our tickets
-	 * are getting old and crusty.
-	 */
-	int (*should_authenticate)(struct ceph_auth_client *ac);
-
-	/*
-	 * build requests and process replies during monitor
-	 * handshake.  if handle_reply returns -EAGAIN, we build
-	 * another request.
-	 */
-	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-	int (*handle_reply)(struct ceph_auth_client *ac, int result,
-			    void *buf, void *end);
-
-	/*
-	 * Create authorizer for connecting to a service, and verify
-	 * the response to authenticate the service.
-	 */
-	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
-				 struct ceph_authorizer **a,
-				 void **buf, size_t *len,
-				 void **reply_buf, size_t *reply_len);
-	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
-				       struct ceph_authorizer *a, size_t len);
-	void (*destroy_authorizer)(struct ceph_auth_client *ac,
-				   struct ceph_authorizer *a);
-	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
-				      int peer_type);
-
-	/* reset when we (re)connect to a monitor */
-	void (*reset)(struct ceph_auth_client *ac);
-
-	void (*destroy)(struct ceph_auth_client *ac);
-};
-
-struct ceph_auth_client {
-	u32 protocol;           /* CEPH_AUTH_* */
-	void *private;          /* for use by protocol implementation */
-	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
-
-	bool negotiating;       /* true if negotiating protocol */
-	const char *name;       /* entity name */
-	u64 global_id;          /* our unique id in system */
-	const char *secret;     /* our secret key */
-	unsigned want_keys;     /* which services we want */
-};
-
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
-					       const char *secret);
-extern void ceph_auth_destroy(struct ceph_auth_client *ac);
-
-extern void ceph_auth_reset(struct ceph_auth_client *ac);
-
-extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
-				 void *buf, size_t len);
-extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-				  void *buf, size_t len,
-				  void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
-
-extern int ceph_build_auth(struct ceph_auth_client *ac,
-		    void *msg_buf, size_t msg_len);
-
-extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-
-#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
-
-static void reset(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	xi->starting = true;
-	xi->built_authorizer = false;
-}
-
-static void destroy(struct ceph_auth_client *ac)
-{
-	kfree(ac->private);
-	ac->private = NULL;
-}
-
-static int is_authenticated(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	return !xi->starting;
-}
-
-static int should_authenticate(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	return xi->starting;
-}
-
-/*
- * the generic auth code decode the global_id, and we carry no actual
- * authenticate state, so nothing happens here.
- */
-static int handle_reply(struct ceph_auth_client *ac, int result,
-			void *buf, void *end)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	xi->starting = false;
-	return result;
-}
-
-/*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
- */
-static int ceph_auth_none_create_authorizer(
-	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
-{
-	struct ceph_auth_none_info *ai = ac->private;
-	struct ceph_none_authorizer *au = &ai->au;
-	void *p, *end;
-	int ret;
-
-	if (!ai->built_authorizer) {
-		p = au->buf;
-		end = p + sizeof(au->buf);
-		ceph_encode_8(&p, 1);
-		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-		if (ret < 0)
-			goto bad;
-		ceph_decode_need(&p, end, sizeof(u64), bad2);
-		ceph_encode_64(&p, ac->global_id);
-		au->buf_len = p - (void *)au->buf;
-		ai->built_authorizer = true;
-		dout("built authorizer len %d\n", au->buf_len);
-	}
-
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf;
-	*len = au->buf_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
-	return 0;
-
-bad2:
-	ret = -ERANGE;
-bad:
-	return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	/* nothing to do */
-}
-
-static const struct ceph_auth_client_ops ceph_auth_none_ops = {
-	.name = "none",
-	.reset = reset,
-	.destroy = destroy,
-	.is_authenticated = is_authenticated,
-	.should_authenticate = should_authenticate,
-	.handle_reply = handle_reply,
-	.create_authorizer = ceph_auth_none_create_authorizer,
-	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
-};
-
-int ceph_auth_none_init(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi;
-
-	dout("ceph_auth_none_init %p\n", ac);
-	xi = kzalloc(sizeof(*xi), GFP_NOFS);
-	if (!xi)
-		return -ENOMEM;
-
-	xi->starting = true;
-	xi->built_authorizer = false;
-
-	ac->protocol = CEPH_AUTH_NONE;
-	ac->private = xi;
-	ac->ops = &ceph_auth_none_ops;
-	return 0;
-}
-
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _FS_CEPH_AUTH_NONE_H
-#define _FS_CEPH_AUTH_NONE_H
-
-#include <linux/slab.h>
-
-#include "auth.h"
-
-/*
- * null security mode.
- *
- * we use a single static authorizer that simply encodes our entity name
- * and global id.
- */
-
-struct ceph_none_authorizer {
-	char buf[128];
-	int buf_len;
-	char reply_buf[0];
-};
-
-struct ceph_auth_none_info {
-	bool starting;
-	bool built_authorizer;
-	struct ceph_none_authorizer au;   /* we only need one; it's static */
-};
-
-extern int ceph_auth_none_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_x.h"
-#include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
-
-#define TEMP_TICKET_BUF_LEN	256
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
-
-static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-
-	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return (ac->want_keys & xi->have_keys) == ac->want_keys;
-}
-
-static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-
-	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return need != 0;
-}
-
-static int ceph_x_encrypt_buflen(int ilen)
-{
-	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
-		sizeof(u32);
-}
-
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
-			  void *ibuf, int ilen, void *obuf, size_t olen)
-{
-	struct ceph_x_encrypt_header head = {
-		.struct_v = 1,
-		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
-	};
-	size_t len = olen - sizeof(u32);
-	int ret;
-
-	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
-			    &head, sizeof(head), ibuf, ilen);
-	if (ret)
-		return ret;
-	ceph_encode_32(&obuf, len);
-	return len + sizeof(u32);
-}
-
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
-			  void **p, void *end, void *obuf, size_t olen)
-{
-	struct ceph_x_encrypt_header head;
-	size_t head_len = sizeof(head);
-	int len, ret;
-
-	len = ceph_decode_32(p);
-	if (*p + len > end)
-		return -EINVAL;
-
-	dout("ceph_x_decrypt len %d\n", len);
-	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
-			    *p, len);
-	if (ret)
-		return ret;
-	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
-		return -EPERM;
-	*p += len;
-	return olen;
-}
-
-/*
- * get existing (or insert new) ticket handler
- */
-static struct ceph_x_ticket_handler *
-get_ticket_handler(struct ceph_auth_client *ac, int service)
-{
-	struct ceph_x_ticket_handler *th;
-	struct ceph_x_info *xi = ac->private;
-	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
-
-	while (*p) {
-		parent = *p;
-		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
-		if (service < th->service)
-			p = &(*p)->rb_left;
-		else if (service > th->service)
-			p = &(*p)->rb_right;
-		else
-			return th;
-	}
-
-	/* add it */
-	th = kzalloc(sizeof(*th), GFP_NOFS);
-	if (!th)
-		return ERR_PTR(-ENOMEM);
-	th->service = service;
-	rb_link_node(&th->node, parent, p);
-	rb_insert_color(&th->node, &xi->ticket_handlers);
-	return th;
-}
-
-static void remove_ticket_handler(struct ceph_auth_client *ac,
-				  struct ceph_x_ticket_handler *th)
-{
-	struct ceph_x_info *xi = ac->private;
-
-	dout("remove_ticket_handler %p %d\n", th, th->service);
-	rb_erase(&th->node, &xi->ticket_handlers);
-	ceph_crypto_key_destroy(&th->session_key);
-	if (th->ticket_blob)
-		ceph_buffer_put(th->ticket_blob);
-	kfree(th);
-}
-
-static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
-				    struct ceph_crypto_key *secret,
-				    void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	int num;
-	void *p = buf;
-	int ret;
-	char *dbuf;
-	char *ticket_buf;
-	u8 reply_struct_v;
-
-	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-	if (!dbuf)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-	if (!ticket_buf)
-		goto out_dbuf;
-
-	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	reply_struct_v = ceph_decode_8(&p);
-	if (reply_struct_v != 1)
-		goto bad;
-	num = ceph_decode_32(&p);
-	dout("%d tickets\n", num);
-	while (num--) {
-		int type;
-		u8 tkt_struct_v, blob_struct_v;
-		struct ceph_x_ticket_handler *th;
-		void *dp, *dend;
-		int dlen;
-		char is_enc;
-		struct timespec validity;
-		struct ceph_crypto_key old_key;
-		void *tp, *tpend;
-		struct ceph_timespec new_validity;
-		struct ceph_crypto_key new_session_key;
-		struct ceph_buffer *new_ticket_blob;
-		unsigned long new_expires, new_renew_after;
-		u64 new_secret_id;
-
-		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
-
-		type = ceph_decode_32(&p);
-		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-
-		tkt_struct_v = ceph_decode_8(&p);
-		if (tkt_struct_v != 1)
-			goto bad;
-
-		th = get_ticket_handler(ac, type);
-		if (IS_ERR(th)) {
-			ret = PTR_ERR(th);
-			goto out;
-		}
-
-		/* blob for me */
-		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
-				      TEMP_TICKET_BUF_LEN);
-		if (dlen <= 0) {
-			ret = dlen;
-			goto out;
-		}
-		dout(" decrypted %d bytes\n", dlen);
-		dend = dbuf + dlen;
-		dp = dbuf;
-
-		tkt_struct_v = ceph_decode_8(&dp);
-		if (tkt_struct_v != 1)
-			goto bad;
-
-		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
-		if (ret)
-			goto out;
-
-		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-		ceph_decode_timespec(&validity, &new_validity);
-		new_expires = get_seconds() + validity.tv_sec;
-		new_renew_after = new_expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", new_expires,
-		     new_renew_after);
-
-		/* ticket blob for service */
-		ceph_decode_8_safe(&p, end, is_enc, bad);
-		tp = ticket_buf;
-		if (is_enc) {
-			/* encrypted */
-			dout(" encrypted ticket\n");
-			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
-					      TEMP_TICKET_BUF_LEN);
-			if (dlen < 0) {
-				ret = dlen;
-				goto out;
-			}
-			dlen = ceph_decode_32(&tp);
-		} else {
-			/* unencrypted */
-			ceph_decode_32_safe(&p, end, dlen, bad);
-			ceph_decode_need(&p, end, dlen, bad);
-			ceph_decode_copy(&p, ticket_buf, dlen);
-		}
-		tpend = tp + dlen;
-		dout(" ticket blob is %d bytes\n", dlen);
-		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		blob_struct_v = ceph_decode_8(&tp);
-		new_secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
-		if (ret)
-			goto out;
-
-		/* all is well, update our ticket */
-		ceph_crypto_key_destroy(&th->session_key);
-		if (th->ticket_blob)
-			ceph_buffer_put(th->ticket_blob);
-		th->session_key = new_session_key;
-		th->ticket_blob = new_ticket_blob;
-		th->validity = new_validity;
-		th->secret_id = new_secret_id;
-		th->expires = new_expires;
-		th->renew_after = new_renew_after;
-		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
-		     type, ceph_entity_type_name(type), th->secret_id,
-		     (int)th->ticket_blob->vec.iov_len);
-		xi->have_keys |= th->service;
-	}
-
-	ret = 0;
-out:
-	kfree(ticket_buf);
-out_dbuf:
-	kfree(dbuf);
-	return ret;
-
-bad:
-	ret = -EINVAL;
-	goto out;
-}
-
-static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
-				   struct ceph_x_ticket_handler *th,
-				   struct ceph_x_authorizer *au)
-{
-	int maxlen;
-	struct ceph_x_authorize_a *msg_a;
-	struct ceph_x_authorize_b msg_b;
-	void *p, *end;
-	int ret;
-	int ticket_blob_len =
-		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
-
-	dout("build_authorizer for %s %p\n",
-	     ceph_entity_type_name(th->service), au);
-
-	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
-		ceph_x_encrypt_buflen(ticket_blob_len);
-	dout("  need len %d\n", maxlen);
-	if (au->buf && au->buf->alloc_len < maxlen) {
-		ceph_buffer_put(au->buf);
-		au->buf = NULL;
-	}
-	if (!au->buf) {
-		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
-		if (!au->buf)
-			return -ENOMEM;
-	}
-	au->service = th->service;
-
-	msg_a = au->buf->vec.iov_base;
-	msg_a->struct_v = 1;
-	msg_a->global_id = cpu_to_le64(ac->global_id);
-	msg_a->service_id = cpu_to_le32(th->service);
-	msg_a->ticket_blob.struct_v = 1;
-	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
-	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
-	if (ticket_blob_len) {
-		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
-		       th->ticket_blob->vec.iov_len);
-	}
-	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
-	     le64_to_cpu(msg_a->ticket_blob.secret_id));
-
-	p = msg_a + 1;
-	p += ticket_blob_len;
-	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-
-	get_random_bytes(&au->nonce, sizeof(au->nonce));
-	msg_b.struct_v = 1;
-	msg_b.nonce = cpu_to_le64(au->nonce);
-	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
-			     p, end - p);
-	if (ret < 0)
-		goto out_buf;
-	p += ret;
-	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
-	dout(" built authorizer nonce %llx len %d\n", au->nonce,
-	     (int)au->buf->vec.iov_len);
-	BUG_ON(au->buf->vec.iov_len > maxlen);
-	return 0;
-
-out_buf:
-	ceph_buffer_put(au->buf);
-	au->buf = NULL;
-	return ret;
-}
-
-static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
-				void **p, void *end)
-{
-	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, th->secret_id);
-	if (th->ticket_blob) {
-		const char *buf = th->ticket_blob->vec.iov_base;
-		u32 len = th->ticket_blob->vec.iov_len;
-
-		ceph_encode_32_safe(p, end, len, bad);
-		ceph_encode_copy_safe(p, end, buf, len, bad);
-	} else {
-		ceph_encode_32_safe(p, end, 0, bad);
-	}
-
-	return 0;
-bad:
-	return -ERANGE;
-}
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
-{
-	int want = ac->want_keys;
-	struct ceph_x_info *xi = ac->private;
-	int service;
-
-	*pneed = ac->want_keys & ~(xi->have_keys);
-
-	for (service = 1; service <= want; service <<= 1) {
-		struct ceph_x_ticket_handler *th;
-
-		if (!(ac->want_keys & service))
-			continue;
-
-		if (*pneed & service)
-			continue;
-
-		th = get_ticket_handler(ac, service);
-
-		if (IS_ERR(th)) {
-			*pneed |= service;
-			continue;
-		}
-
-		if (get_seconds() >= th->renew_after)
-			*pneed |= service;
-		if (get_seconds() >= th->expires)
-			xi->have_keys &= ~service;
-	}
-}
-
-
-static int ceph_x_build_request(struct ceph_auth_client *ac,
-				void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-	struct ceph_x_request_header *head = buf;
-	int ret;
-	struct ceph_x_ticket_handler *th =
-		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-
-	if (IS_ERR(th))
-		return PTR_ERR(th);
-
-	ceph_x_validate_tickets(ac, &need);
-
-	dout("build_request want %x have %x need %x\n",
-	     ac->want_keys, xi->have_keys, need);
-
-	if (need & CEPH_ENTITY_TYPE_AUTH) {
-		struct ceph_x_authenticate *auth = (void *)(head + 1);
-		void *p = auth + 1;
-		struct ceph_x_challenge_blob tmp;
-		char tmp_enc[40];
-		u64 *u;
-
-		if (p > end)
-			return -ERANGE;
-
-		dout(" get_auth_session_key\n");
-		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
-
-		/* encrypt and hash */
-		get_random_bytes(&auth->client_challenge, sizeof(u64));
-		tmp.client_challenge = auth->client_challenge;
-		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
-		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
-				     tmp_enc, sizeof(tmp_enc));
-		if (ret < 0)
-			return ret;
-
-		auth->struct_v = 1;
-		auth->key = 0;
-		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
-			auth->key ^= *(__le64 *)u;
-		dout(" server_challenge %llx client_challenge %llx key %llx\n",
-		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
-		     le64_to_cpu(auth->key));
-
-		/* now encode the old ticket if exists */
-		ret = ceph_x_encode_ticket(th, &p, end);
-		if (ret < 0)
-			return ret;
-
-		return p - buf;
-	}
-
-	if (need) {
-		void *p = head + 1;
-		struct ceph_x_service_ticket_request *req;
-
-		if (p > end)
-			return -ERANGE;
-		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
-		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
-		if (ret)
-			return ret;
-		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
-				 xi->auth_authorizer.buf->vec.iov_len);
-
-		req = p;
-		req->keys = cpu_to_le32(need);
-		p += sizeof(*req);
-		return p - buf;
-	}
-
-	return 0;
-}
-
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
-			       void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	struct ceph_x_reply_header *head = buf;
-	struct ceph_x_ticket_handler *th;
-	int len = end - buf;
-	int op;
-	int ret;
-
-	if (result)
-		return result;  /* XXX hmm? */
-
-	if (xi->starting) {
-		/* it's a hello */
-		struct ceph_x_server_challenge *sc = buf;
-
-		if (len != sizeof(*sc))
-			return -EINVAL;
-		xi->server_challenge = le64_to_cpu(sc->server_challenge);
-		dout("handle_reply got server challenge %llx\n",
-		     xi->server_challenge);
-		xi->starting = false;
-		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
-		return -EAGAIN;
-	}
-
-	op = le16_to_cpu(head->op);
-	result = le32_to_cpu(head->result);
-	dout("handle_reply op %d result %d\n", op, result);
-	switch (op) {
-	case CEPHX_GET_AUTH_SESSION_KEY:
-		/* verify auth key */
-		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
-					       buf + sizeof(*head), end);
-		break;
-
-	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
-		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-		if (IS_ERR(th))
-			return PTR_ERR(th);
-		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
-					       buf + sizeof(*head), end);
-		break;
-
-	default:
-		return -EINVAL;
-	}
-	if (ret)
-		return ret;
-	if (ac->want_keys == xi->have_keys)
-		return 0;
-	return -EAGAIN;
-}
-
-static int ceph_x_create_authorizer(
-	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
-{
-	struct ceph_x_authorizer *au;
-	struct ceph_x_ticket_handler *th;
-	int ret;
-
-	th = get_ticket_handler(ac, peer_type);
-	if (IS_ERR(th))
-		return PTR_ERR(th);
-
-	au = kzalloc(sizeof(*au), GFP_NOFS);
-	if (!au)
-		return -ENOMEM;
-
-	ret = ceph_x_build_authorizer(ac, th, au);
-	if (ret) {
-		kfree(au);
-		return ret;
-	}
-
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf->vec.iov_base;
-	*len = au->buf->vec.iov_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
-	return 0;
-}
-
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
-					  struct ceph_authorizer *a, size_t len)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-	struct ceph_x_ticket_handler *th;
-	int ret = 0;
-	struct ceph_x_authorize_reply reply;
-	void *p = au->reply_buf;
-	void *end = p + sizeof(au->reply_buf);
-
-	th = get_ticket_handler(ac, au->service);
-	if (IS_ERR(th))
-		return PTR_ERR(th);
-	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
-	if (ret < 0)
-		return ret;
-	if (ret != sizeof(reply))
-		return -EPERM;
-
-	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
-		ret = -EPERM;
-	else
-		ret = 0;
-	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
-	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
-	return ret;
-}
-
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-
-	ceph_buffer_put(au->buf);
-	kfree(au);
-}
-
-
-static void ceph_x_reset(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-
-	dout("reset\n");
-	xi->starting = true;
-	xi->server_challenge = 0;
-}
-
-static void ceph_x_destroy(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	struct rb_node *p;
-
-	dout("ceph_x_destroy %p\n", ac);
-	ceph_crypto_key_destroy(&xi->secret);
-
-	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
-		struct ceph_x_ticket_handler *th =
-			rb_entry(p, struct ceph_x_ticket_handler, node);
-		remove_ticket_handler(ac, th);
-	}
-
-	if (xi->auth_authorizer.buf)
-		ceph_buffer_put(xi->auth_authorizer.buf);
-
-	kfree(ac->private);
-	ac->private = NULL;
-}
-
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-				   int peer_type)
-{
-	struct ceph_x_ticket_handler *th;
-
-	th = get_ticket_handler(ac, peer_type);
-	if (!IS_ERR(th))
-		remove_ticket_handler(ac, th);
-}
-
-
-static const struct ceph_auth_client_ops ceph_x_ops = {
-	.name = "x",
-	.is_authenticated = ceph_x_is_authenticated,
-	.should_authenticate = ceph_x_should_authenticate,
-	.build_request = ceph_x_build_request,
-	.handle_reply = ceph_x_handle_reply,
-	.create_authorizer = ceph_x_create_authorizer,
-	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-	.destroy_authorizer = ceph_x_destroy_authorizer,
-	.invalidate_authorizer = ceph_x_invalidate_authorizer,
-	.reset =  ceph_x_reset,
-	.destroy = ceph_x_destroy,
-};
-
-
-int ceph_x_init(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi;
-	int ret;
-
-	dout("ceph_x_init %p\n", ac);
-	ret = -ENOMEM;
-	xi = kzalloc(sizeof(*xi), GFP_NOFS);
-	if (!xi)
-		goto out;
-
-	ret = -EINVAL;
-	if (!ac->secret) {
-		pr_err("no secret set (for auth_x protocol)\n");
-		goto out_nomem;
-	}
-
-	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
-	if (ret)
-		goto out_nomem;
-
-	xi->starting = true;
-	xi->ticket_handlers = RB_ROOT;
-
-	ac->protocol = CEPH_AUTH_CEPHX;
-	ac->private = xi;
-	ac->ops = &ceph_x_ops;
-	return 0;
-
-out_nomem:
-	kfree(xi);
-out:
-	return ret;
-}
-
-
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _FS_CEPH_AUTH_X_H
-#define _FS_CEPH_AUTH_X_H
-
-#include <linux/rbtree.h>
-
-#include "crypto.h"
-#include "auth.h"
-#include "auth_x_protocol.h"
-
-/*
- * Handle ticket for a single service.
- */
-struct ceph_x_ticket_handler {
-	struct rb_node node;
-	unsigned service;
-
-	struct ceph_crypto_key session_key;
-	struct ceph_timespec validity;
-
-	u64 secret_id;
-	struct ceph_buffer *ticket_blob;
-
-	unsigned long renew_after, expires;
-};
-
-
-struct ceph_x_authorizer {
-	struct ceph_buffer *buf;
-	unsigned service;
-	u64 nonce;
-	char reply_buf[128];  /* big enough for encrypted blob */
-};
-
-struct ceph_x_info {
-	struct ceph_crypto_key secret;
-
-	bool starting;
-	u64 server_challenge;
-
-	unsigned have_keys;
-	struct rb_root ticket_handlers;
-
-	struct ceph_x_authorizer auth_authorizer;
-};
-
-extern int ceph_x_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __FS_CEPH_AUTH_X_PROTOCOL
-#define __FS_CEPH_AUTH_X_PROTOCOL
-
-#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
-#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
-#define CEPHX_GET_ROTATING_KEY          0x0400
-
-/* common bits */
-struct ceph_x_ticket_blob {
-	__u8 struct_v;
-	__le64 secret_id;
-	__le32 blob_len;
-	char blob[];
-} __attribute__ ((packed));
-
-
-/* common request/reply headers */
-struct ceph_x_request_header {
-	__le16 op;
-} __attribute__ ((packed));
-
-struct ceph_x_reply_header {
-	__le16 op;
-	__le32 result;
-} __attribute__ ((packed));
-
-
-/* authenticate handshake */
-
-/* initial hello (no reply header) */
-struct ceph_x_server_challenge {
-	__u8 struct_v;
-	__le64 server_challenge;
-} __attribute__ ((packed));
-
-struct ceph_x_authenticate {
-	__u8 struct_v;
-	__le64 client_challenge;
-	__le64 key;
-	/* ticket blob */
-} __attribute__ ((packed));
-
-struct ceph_x_service_ticket_request {
-	__u8 struct_v;
-	__le32 keys;
-} __attribute__ ((packed));
-
-struct ceph_x_challenge_blob {
-	__le64 server_challenge;
-	__le64 client_challenge;
-} __attribute__ ((packed));
-
-
-
-/* authorize handshake */
-
-/*
- * The authorizer consists of two pieces:
- *  a - service id, ticket blob
- *  b - encrypted with session key
- */
-struct ceph_x_authorize_a {
-	__u8 struct_v;
-	__le64 global_id;
-	__le32 service_id;
-	struct ceph_x_ticket_blob ticket_blob;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_b {
-	__u8 struct_v;
-	__le64 nonce;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_reply {
-	__u8 struct_v;
-	__le64 nonce_plus_one;
-} __attribute__ ((packed));
-
-
-/*
- * encyption bundle
- */
-#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
-
-struct ceph_x_encrypt_header {
-	__u8 struct_v;
-	__le64 magic;
-} __attribute__ ((packed));
-
-#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-
-#include "buffer.h"
-#include "decode.h"
-
-struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
-{
-	struct ceph_buffer *b;
-
-	b = kmalloc(sizeof(*b), gfp);
-	if (!b)
-		return NULL;
-
-	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-	if (b->vec.iov_base) {
-		b->is_vmalloc = false;
-	} else {
-		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-		if (!b->vec.iov_base) {
-			kfree(b);
-			return NULL;
-		}
-		b->is_vmalloc = true;
-	}
-
-	kref_init(&b->kref);
-	b->alloc_len = len;
-	b->vec.iov_len = len;
-	dout("buffer_new %p\n", b);
-	return b;
-}
-
-void ceph_buffer_release(struct kref *kref)
-{
-	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
-
-	dout("buffer_release %p\n", b);
-	if (b->vec.iov_base) {
-		if (b->is_vmalloc)
-			vfree(b->vec.iov_base);
-		else
-			kfree(b->vec.iov_base);
-	}
-	kfree(b);
-}
-
-int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
-{
-	size_t len;
-
-	ceph_decode_need(p, end, sizeof(u32), bad);
-	len = ceph_decode_32(p);
-	dout("decode_buffer len %d\n", (int)len);
-	ceph_decode_need(p, end, len, bad);
-	*b = ceph_buffer_new(len, GFP_NOFS);
-	if (!*b)
-		return -ENOMEM;
-	ceph_decode_copy(p, (*b)->vec.iov_base, len);
-	return 0;
-bad:
-	return -EINVAL;
-}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-
-#include <linux/kref.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-
-/*
- * a simple reference counted buffer.
- *
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
- */
-struct ceph_buffer {
-	struct kref kref;
-	struct kvec vec;
-	size_t alloc_len;
-	bool is_vmalloc;
-};
-
-extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
-extern void ceph_buffer_release(struct kref *kref);
-
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
-	kref_get(&b->kref);
-	return b;
-}
-
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
-	kref_put(&b->kref, ceph_buffer_release);
-}
-
-extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
-
-#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..3cff67cbb9c0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 
 #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 
 /*
  * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
 			     int *total, int *avail, int *used, int *reserved,
 			     int *min)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 
 	if (total)
 		*total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = mdsc->client->mount_args;
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
 		 struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
 	int mds;
 	struct ceph_cap_snap *capsnap;
 	u32 mseq;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
 						    session->s_mutex */
 	u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1462,8 +1463,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
 {
-	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	int file_wanted, used;
@@ -1706,7 +1707,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1872,7 +1873,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 				       caps_are_flushed(inode, flush_tid));
 	} else {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -2465,7 +2466,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2713,7 +2714,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-#  define dout(fmt, ...)						\
-	pr_debug(" %12.12s:%-4d : " fmt,				\
-		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
-		 __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-#  define dout(fmt, ...)	do {				\
-		if (0)						\
-			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
-	} while (0)
-# endif
-
-#else
-
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
-
-#endif
-
-#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef FS_CEPH_FRAG_H
-#define FS_CEPH_FRAG_H
-
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask.  Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- *   8 upper bits = "bits"
- *  24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value.  This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically.  However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
-	return (b << 24) |
-		(v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
-	return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
-	return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
-	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
-	return 24 - ceph_frag_bits(f);
-}
-
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
-	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-	/* is sub as specific as us, and contained by us? */
-	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f) - 1,
-			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1,
-	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
-	int newbits = ceph_frag_bits(f) + by;
-	return ceph_frag_make(newbits,
-			 ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
-	return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
-	return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-
-#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
-	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	__u32 os = le32_to_cpu(layout->fl_object_size);
-
-	/* stripe unit, object size must be non-zero, 64k increment */
-	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	/* object size must be a multiple of stripe unit */
-	if (os < su || os % su)
-		return 0;
-	/* stripe count must be non-zero */
-	if (!sc)
-		return 0;
-	return 1;
-}
-
-
-int ceph_flags_to_mode(int flags)
-{
-	int mode;
-
-#ifdef O_DIRECTORY  /* fixme */
-	if ((flags & O_DIRECTORY) == O_DIRECTORY)
-		return CEPH_FILE_MODE_PIN;
-#endif
-	if ((flags & O_APPEND) == O_APPEND)
-		flags |= O_WRONLY;
-
-	if ((flags & O_ACCMODE) == O_RDWR)
-		mode = CEPH_FILE_MODE_RDWR;
-	else if ((flags & O_ACCMODE) == O_WRONLY)
-		mode = CEPH_FILE_MODE_WR;
-	else
-		mode = CEPH_FILE_MODE_RD;
-
-#ifdef O_LAZY
-	if (flags & O_LAZY)
-		mode |= CEPH_FILE_MODE_LAZY;
-#endif
-
-	return mode;
-}
-
-int ceph_caps_for_mode(int mode)
-{
-	int caps = CEPH_CAP_PIN;
-
-	if (mode & CEPH_FILE_MODE_RD)
-		caps |= CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-	if (mode & CEPH_FILE_MODE_WR)
-		caps |= CEPH_CAP_FILE_EXCL |
-			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-	if (mode & CEPH_FILE_MODE_LAZY)
-		caps |= CEPH_CAP_FILE_LAZYIO;
-
-	return caps;
-}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * ceph_fs.h - Ceph constants and data types to share between kernel and
- * user space.
- *
- * Most types in this file are defined as little-endian, and are
- * primarily intended to describe data structures that pass over the
- * wire or that are stored on disk.
- *
- * LGPL2
- */
-
-#ifndef CEPH_FS_H
-#define CEPH_FS_H
-
-#include "msgr.h"
-#include "rados.h"
-
-/*
- * subprotocol versions.  when specific messages types or high-level
- * protocols change, bump the affected components.  we keep rev
- * internal cluster protocols separately from the public,
- * client-facing protocol.
- */
-#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
-#define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   24 /* server/client */
-#define CEPH_MDSC_PROTOCOL   32 /* server/client */
-#define CEPH_MONC_PROTOCOL   15 /* server/client */
-
-
-#define CEPH_INO_ROOT  1
-#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
-
-/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
-#define CEPH_MAX_MON   31
-
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
-#define CEPH_FEATURE_FLOCK          (1<<3)
-
-
-/*
- * ceph_file_layout - describe data layout for a file/inode
- */
-struct ceph_file_layout {
-	/* file -> object mapping */
-	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
-				      of page size. */
-	__le32 fl_stripe_count;    /* over this many objects */
-	__le32 fl_object_size;     /* until objects are this big, then move to
-				      new objects */
-	__le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
-
-	/* pg -> disk layout */
-	__le32 fl_object_stripe_unit;  /* for per-object parity, if any */
-
-	/* object -> pg layout */
-	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
-	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
-} __attribute__ ((packed));
-
-#define CEPH_MIN_STRIPE_UNIT 65536
-
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
-
-/* crypto algorithms */
-#define CEPH_CRYPTO_NONE 0x0
-#define CEPH_CRYPTO_AES  0x1
-
-#define CEPH_AES_IV "cephsageyudagreg"
-
-/* security/authentication protocols */
-#define CEPH_AUTH_UNKNOWN	0x0
-#define CEPH_AUTH_NONE	 	0x1
-#define CEPH_AUTH_CEPHX	 	0x2
-
-#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
-
-
-/*********************************************
- * message layer
- */
-
-/*
- * message types
- */
-
-/* misc */
-#define CEPH_MSG_SHUTDOWN               1
-#define CEPH_MSG_PING                   2
-
-/* client <-> monitor */
-#define CEPH_MSG_MON_MAP                4
-#define CEPH_MSG_MON_GET_MAP            5
-#define CEPH_MSG_STATFS                 13
-#define CEPH_MSG_STATFS_REPLY           14
-#define CEPH_MSG_MON_SUBSCRIBE          15
-#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
-#define CEPH_MSG_AUTH			17
-#define CEPH_MSG_AUTH_REPLY		18
-
-/* client <-> mds */
-#define CEPH_MSG_MDS_MAP                21
-
-#define CEPH_MSG_CLIENT_SESSION         22
-#define CEPH_MSG_CLIENT_RECONNECT       23
-
-#define CEPH_MSG_CLIENT_REQUEST         24
-#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
-#define CEPH_MSG_CLIENT_REPLY           26
-#define CEPH_MSG_CLIENT_CAPS            0x310
-#define CEPH_MSG_CLIENT_LEASE           0x311
-#define CEPH_MSG_CLIENT_SNAP            0x312
-#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
-
-/* pool ops */
-#define CEPH_MSG_POOLOP_REPLY           48
-#define CEPH_MSG_POOLOP                 49
-
-
-/* osd */
-#define CEPH_MSG_OSD_MAP          41
-#define CEPH_MSG_OSD_OP           42
-#define CEPH_MSG_OSD_OPREPLY      43
-
-/* pool operations */
-enum {
-  POOL_OP_CREATE			= 0x01,
-  POOL_OP_DELETE			= 0x02,
-  POOL_OP_AUID_CHANGE			= 0x03,
-  POOL_OP_CREATE_SNAP			= 0x11,
-  POOL_OP_DELETE_SNAP			= 0x12,
-  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
-  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
-};
-
-struct ceph_mon_request_header {
-	__le64 have_version;
-	__le16 session_mon;
-	__le64 session_mon_tid;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_statfs {
-	__le64 kb, kb_used, kb_avail;
-	__le64 num_objects;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs_reply {
-	struct ceph_fsid fsid;
-	__le64 version;
-	struct ceph_statfs st;
-} __attribute__ ((packed));
-
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 pool;
-	__le32 op;
-	__le64 auid;
-	__le64 snapid;
-	__le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 reply_code;
-	__le32 epoch;
-	char has_data;
-	char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
-	__le64 snapid;
-} __attribute__ ((packed));
-
-struct ceph_osd_getmap {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 start;
-} __attribute__ ((packed));
-
-struct ceph_mds_getmap {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_client_mount {
-	struct ceph_mon_request_header monhdr;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_item {
-	__le64 have_version;	__le64 have;
-	__u8 onetime;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_ack {
-	__le32 duration;         /* seconds */
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-/*
- * mds states
- *   > 0 -> in
- *  <= 0 -> out
- */
-#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
-#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
-					  empty log. */
-#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
-#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
-#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
-#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
-#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
-
-#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
-#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
-					  operations (import, rename, etc.) */
-#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
-#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
-#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
-#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
-#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
-
-extern const char *ceph_mds_state_name(int s);
-
-
-/*
- * metadata lock types.
- *  - these are bitmasks.. we can compose them
- *  - they also define the lock ordering by the MDS
- *  - a few of these are internal to the mds
- */
-#define CEPH_LOCK_DVERSION    1
-#define CEPH_LOCK_DN          2
-#define CEPH_LOCK_ISNAP       16
-#define CEPH_LOCK_IVERSION    32    /* mds internal */
-#define CEPH_LOCK_IFILE       64
-#define CEPH_LOCK_IAUTH       128
-#define CEPH_LOCK_ILINK       256
-#define CEPH_LOCK_IDFT        512   /* dir frag tree */
-#define CEPH_LOCK_INEST       1024  /* mds internal */
-#define CEPH_LOCK_IXATTR      2048
-#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
-#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
-
-/* client_session ops */
-enum {
-	CEPH_SESSION_REQUEST_OPEN,
-	CEPH_SESSION_OPEN,
-	CEPH_SESSION_REQUEST_CLOSE,
-	CEPH_SESSION_CLOSE,
-	CEPH_SESSION_REQUEST_RENEWCAPS,
-	CEPH_SESSION_RENEWCAPS,
-	CEPH_SESSION_STALE,
-	CEPH_SESSION_RECALL_STATE,
-};
-
-extern const char *ceph_session_op_name(int op);
-
-struct ceph_mds_session_head {
-	__le32 op;
-	__le64 seq;
-	struct ceph_timespec stamp;
-	__le32 max_caps, max_leases;
-} __attribute__ ((packed));
-
-/* client_request */
-/*
- * metadata ops.
- *  & 0x001000 -> write op
- *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
- &  & 0x100000 -> use weird ino/path trace
- */
-#define CEPH_MDS_OP_WRITE        0x001000
-enum {
-	CEPH_MDS_OP_LOOKUP     = 0x00100,
-	CEPH_MDS_OP_GETATTR    = 0x00101,
-	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
-	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
-
-	CEPH_MDS_OP_SETXATTR   = 0x01105,
-	CEPH_MDS_OP_RMXATTR    = 0x01106,
-	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
-	CEPH_MDS_OP_SETATTR    = 0x01108,
-	CEPH_MDS_OP_SETFILELOCK= 0x01109,
-	CEPH_MDS_OP_GETFILELOCK= 0x00110,
-
-	CEPH_MDS_OP_MKNOD      = 0x01201,
-	CEPH_MDS_OP_LINK       = 0x01202,
-	CEPH_MDS_OP_UNLINK     = 0x01203,
-	CEPH_MDS_OP_RENAME     = 0x01204,
-	CEPH_MDS_OP_MKDIR      = 0x01220,
-	CEPH_MDS_OP_RMDIR      = 0x01221,
-	CEPH_MDS_OP_SYMLINK    = 0x01222,
-
-	CEPH_MDS_OP_CREATE     = 0x01301,
-	CEPH_MDS_OP_OPEN       = 0x00302,
-	CEPH_MDS_OP_READDIR    = 0x00305,
-
-	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
-	CEPH_MDS_OP_MKSNAP     = 0x01400,
-	CEPH_MDS_OP_RMSNAP     = 0x01401,
-	CEPH_MDS_OP_LSSNAP     = 0x00402,
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
-
-union ceph_mds_request_args {
-	struct {
-		__le32 mask;                 /* CEPH_CAP_* */
-	} __attribute__ ((packed)) getattr;
-	struct {
-		__le32 mode;
-		__le32 uid;
-		__le32 gid;
-		struct ceph_timespec mtime;
-		struct ceph_timespec atime;
-		__le64 size, old_size;       /* old_size needed by truncate */
-		__le32 mask;                 /* CEPH_SETATTR_* */
-	} __attribute__ ((packed)) setattr;
-	struct {
-		__le32 frag;                 /* which dir fragment */
-		__le32 max_entries;          /* how many dentries to grab */
-		__le32 max_bytes;
-	} __attribute__ ((packed)) readdir;
-	struct {
-		__le32 mode;
-		__le32 rdev;
-	} __attribute__ ((packed)) mknod;
-	struct {
-		__le32 mode;
-	} __attribute__ ((packed)) mkdir;
-	struct {
-		__le32 flags;
-		__le32 mode;
-		__le32 stripe_unit;          /* layout for newly created file */
-		__le32 stripe_count;         /* ... */
-		__le32 object_size;
-		__le32 file_replication;
-		__le32 preferred;
-	} __attribute__ ((packed)) open;
-	struct {
-		__le32 flags;
-	} __attribute__ ((packed)) setxattr;
-	struct {
-		struct ceph_file_layout layout;
-	} __attribute__ ((packed)) setlayout;
-	struct {
-		__u8 rule; /* currently fcntl or flock */
-		__u8 type; /* shared, exclusive, remove*/
-		__le64 pid; /* process id requesting the lock */
-		__le64 pid_namespace;
-		__le64 start; /* initial location to lock */
-		__le64 length; /* num bytes to lock from start */
-		__u8 wait; /* will caller wait for lock to become available? */
-	} __attribute__ ((packed)) filelock_change;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
-
-struct ceph_mds_request_head {
-	__le64 oldest_client_tid;
-	__le32 mdsmap_epoch;           /* on client */
-	__le32 flags;                  /* CEPH_MDS_FLAG_* */
-	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
-	__le16 num_releases;           /* # include cap/lease release records */
-	__le32 op;                     /* mds op code */
-	__le32 caller_uid, caller_gid;
-	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
-					  etc. (if replaying) */
-	union ceph_mds_request_args args;
-} __attribute__ ((packed));
-
-/* cap/lease release record */
-struct ceph_mds_request_release {
-	__le64 ino, cap_id;            /* ino and unique cap id */
-	__le32 caps, wanted;           /* new issued, wanted */
-	__le32 seq, issue_seq, mseq;
-	__le32 dname_seq;              /* if releasing a dentry lease, a */
-	__le32 dname_len;              /* string follows. */
-} __attribute__ ((packed));
-
-/* client reply */
-struct ceph_mds_reply_head {
-	__le32 op;
-	__le32 result;
-	__le32 mdsmap_epoch;
-	__u8 safe;                     /* true if committed to disk */
-	__u8 is_dentry, is_target;     /* true if dentry, target inode records
-					  are included with reply */
-} __attribute__ ((packed));
-
-/* one for each node split */
-struct ceph_frag_tree_split {
-	__le32 frag;                   /* this frag splits... */
-	__le32 by;                     /* ...by this many bits */
-} __attribute__ ((packed));
-
-struct ceph_frag_tree_head {
-	__le32 nsplits;                /* num ceph_frag_tree_split records */
-	struct ceph_frag_tree_split splits[];
-} __attribute__ ((packed));
-
-/* capability issue, for bundling with mds reply */
-struct ceph_mds_reply_cap {
-	__le32 caps, wanted;           /* caps issued, wanted */
-	__le64 cap_id;
-	__le32 seq, mseq;
-	__le64 realm;                  /* snap realm */
-	__u8 flags;                    /* CEPH_CAP_FLAG_* */
-} __attribute__ ((packed));
-
-#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
-
-/* inode record, for bundling with mds reply */
-struct ceph_mds_reply_inode {
-	__le64 ino;
-	__le64 snapid;
-	__le32 rdev;
-	__le64 version;                /* inode version */
-	__le64 xattr_version;          /* version for xattr blob */
-	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-	struct ceph_file_layout layout;
-	struct ceph_timespec ctime, mtime, atime;
-	__le32 time_warp_seq;
-	__le64 size, max_size, truncate_size;
-	__le32 truncate_seq;
-	__le32 mode, uid, gid;
-	__le32 nlink;
-	__le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
-	struct ceph_timespec rctime;
-	struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
-} __attribute__ ((packed));
-/* followed by frag array, then symlink string, then xattr blob */
-
-/* reply_lease follows dname, and reply_inode */
-struct ceph_mds_reply_lease {
-	__le16 mask;            /* lease type(s) */
-	__le32 duration_ms;     /* lease duration */
-	__le32 seq;
-} __attribute__ ((packed));
-
-struct ceph_mds_reply_dirfrag {
-	__le32 frag;            /* fragment */
-	__le32 auth;            /* auth mds, if this is a delegation point */
-	__le32 ndist;           /* number of mds' this is replicated on */
-	__le32 dist[];
-} __attribute__ ((packed));
-
-#define CEPH_LOCK_FCNTL    1
-#define CEPH_LOCK_FLOCK    2
-
-#define CEPH_LOCK_SHARED   1
-#define CEPH_LOCK_EXCL     2
-#define CEPH_LOCK_UNLOCK   4
-
-struct ceph_filelock {
-	__le64 start;/* file offset to start lock at */
-	__le64 length; /* num bytes to lock; 0 for all following start */
-	__le64 client; /* which client holds the lock */
-	__le64 pid; /* process id holding the lock on the client */
-	__le64 pid_namespace;
-	__u8 type; /* shared lock, exclusive lock, or unlock */
-} __attribute__ ((packed));
-
-
-/* file access modes */
-#define CEPH_FILE_MODE_PIN        0
-#define CEPH_FILE_MODE_RD         1
-#define CEPH_FILE_MODE_WR         2
-#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
-#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
-#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
-
-int ceph_flags_to_mode(int flags);
-
-
-/* capability bits */
-#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
-
-/* generic cap bits */
-#define CEPH_CAP_GSHARED     1  /* client can reads */
-#define CEPH_CAP_GEXCL       2  /* client can read and update */
-#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
-#define CEPH_CAP_GRD         8  /* (file) client can read */
-#define CEPH_CAP_GWR        16  /* (file) client can write */
-#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
-#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
-#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
-
-/* per-lock shift */
-#define CEPH_CAP_SAUTH      2
-#define CEPH_CAP_SLINK      4
-#define CEPH_CAP_SXATTR     6
-#define CEPH_CAP_SFILE      8
-#define CEPH_CAP_SFLOCK    20 
-
-#define CEPH_CAP_BITS       22
-
-/* composed values */
-#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
-#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
-#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
-#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
-#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
-#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
-#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
-#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
-
-
-/* cap masks (for getattr) */
-#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
-#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
-#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
-#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
-#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
-#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
-#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
-				 CEPH_CAP_AUTH_SHARED |	\
-				 CEPH_CAP_LINK_SHARED |	\
-				 CEPH_CAP_FILE_SHARED |	\
-				 CEPH_CAP_XATTR_SHARED)
-
-#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
-			      CEPH_CAP_LINK_SHARED |			\
-			      CEPH_CAP_XATTR_SHARED |			\
-			      CEPH_CAP_FILE_SHARED)
-#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
-			   CEPH_CAP_FILE_CACHE)
-
-#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
-			   CEPH_CAP_LINK_EXCL |		\
-			   CEPH_CAP_XATTR_EXCL |	\
-			   CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
-			      CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
-#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
-			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
-			   CEPH_CAP_PIN)
-
-#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
-			CEPH_LOCK_IXATTR)
-
-int ceph_caps_for_mode(int mode);
-
-enum {
-	CEPH_CAP_OP_GRANT,         /* mds->client grant */
-	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
-	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
-	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
-	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
-	CEPH_CAP_OP_UPDATE,        /* client->mds update */
-	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
-	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
-	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
-	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
-	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
-	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
-	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
-};
-
-extern const char *ceph_cap_op_name(int op);
-
-/*
- * caps message, used for capability callbacks, acks, requests, etc.
- */
-struct ceph_mds_caps {
-	__le32 op;                  /* CEPH_CAP_OP_* */
-	__le64 ino, realm;
-	__le64 cap_id;
-	__le32 seq, issue_seq;
-	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
-	__le32 migrate_seq;
-	__le64 snap_follows;
-	__le32 snap_trace_len;
-
-	/* authlock */
-	__le32 uid, gid, mode;
-
-	/* linklock */
-	__le32 nlink;
-
-	/* xattrlock */
-	__le32 xattr_len;
-	__le64 xattr_version;
-
-	/* filelock */
-	__le64 size, max_size, truncate_size;
-	__le32 truncate_seq;
-	struct ceph_timespec mtime, atime, ctime;
-	struct ceph_file_layout layout;
-	__le32 time_warp_seq;
-} __attribute__ ((packed));
-
-/* cap release msg head */
-struct ceph_mds_cap_release {
-	__le32 num;                /* number of cap_items that follow */
-} __attribute__ ((packed));
-
-struct ceph_mds_cap_item {
-	__le64 ino;
-	__le64 cap_id;
-	__le32 migrate_seq, seq;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
-#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
-#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
-#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
-
-extern const char *ceph_lease_op_name(int o);
-
-/* lease msg header */
-struct ceph_mds_lease {
-	__u8 action;            /* CEPH_MDS_LEASE_* */
-	__le16 mask;            /* which lease */
-	__le64 ino;
-	__le64 first, last;     /* snap range */
-	__le32 seq;
-	__le32 duration_ms;     /* duration of renewal */
-} __attribute__ ((packed));
-/* followed by a __le32+string for dname */
-
-/* client reconnect */
-struct ceph_mds_cap_reconnect {
-	__le64 cap_id;
-	__le32 wanted;
-	__le32 issued;
-	__le64 snaprealm;
-	__le64 pathbase;        /* base ino for our path to this ino */
-	__le32 flock_len;       /* size of flock state blob, if any */
-} __attribute__ ((packed));
-/* followed by flock blob */
-
-struct ceph_mds_cap_reconnect_v1 {
-	__le64 cap_id;
-	__le32 wanted;
-	__le32 issued;
-	__le64 size;
-	struct ceph_timespec mtime, atime;
-	__le64 snaprealm;
-	__le64 pathbase;        /* base ino for our path to this ino */
-} __attribute__ ((packed));
-
-struct ceph_mds_snaprealm_reconnect {
-	__le64 ino;     /* snap realm base */
-	__le64 seq;     /* snap seq for this snap realm */
-	__le64 parent;  /* parent realm */
-} __attribute__ ((packed));
-
-/*
- * snaps
- */
-enum {
-	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
-	CEPH_SNAP_OP_CREATE,
-	CEPH_SNAP_OP_DESTROY,
-	CEPH_SNAP_OP_SPLIT,
-};
-
-extern const char *ceph_snap_op_name(int o);
-
-/* snap msg header */
-struct ceph_mds_snap_head {
-	__le32 op;                /* CEPH_SNAP_OP_* */
-	__le64 split;             /* ino to split off, if any */
-	__le32 num_split_inos;    /* # inos belonging to new child realm */
-	__le32 num_split_realms;  /* # child realms udner new child realm */
-	__le32 trace_len;         /* size of snap trace blob */
-} __attribute__ ((packed));
-/* followed by split ino list, then split realms, then the trace blob */
-
-/*
- * encode info about a snaprealm, as viewed by a client
- */
-struct ceph_mds_snap_realm {
-	__le64 ino;           /* ino */
-	__le64 created;       /* snap: when created */
-	__le64 parent;        /* ino: parent realm */
-	__le64 parent_since;  /* snap: same parent since */
-	__le64 seq;           /* snap: version */
-	__le32 num_snaps;
-	__le32 num_prior_parent_snaps;
-} __attribute__ ((packed));
-/* followed by my snap list, then prior parent snap list */
-
-#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
-
-#include "types.h"
-
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c)						\
-	do {							\
-		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
-	} while (0)
-
-unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
-{
-	const unsigned char *k = (const unsigned char *)str;
-	__u32 a, b, c;  /* the internal state */
-	__u32 len;      /* how many key bytes still need mixing */
-
-	/* Set up the internal state */
-	len = length;
-	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
-	b = a;
-	c = 0;               /* variable initialization of internal state */
-
-	/* handle most of the key */
-	while (len >= 12) {
-		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
-			 ((__u32)k[3] << 24));
-		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
-			 ((__u32)k[7] << 24));
-		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
-			 ((__u32)k[11] << 24));
-		mix(a, b, c);
-		k = k + 12;
-		len = len - 12;
-	}
-
-	/* handle the last 11 bytes */
-	c = c + length;
-	switch (len) {            /* all the case statements fall through */
-	case 11:
-		c = c + ((__u32)k[10] << 24);
-	case 10:
-		c = c + ((__u32)k[9] << 16);
-	case 9:
-		c = c + ((__u32)k[8] << 8);
-		/* the first byte of c is reserved for the length */
-	case 8:
-		b = b + ((__u32)k[7] << 24);
-	case 7:
-		b = b + ((__u32)k[6] << 16);
-	case 6:
-		b = b + ((__u32)k[5] << 8);
-	case 5:
-		b = b + k[4];
-	case 4:
-		a = a + ((__u32)k[3] << 24);
-	case 3:
-		a = a + ((__u32)k[2] << 16);
-	case 2:
-		a = a + ((__u32)k[1] << 8);
-	case 1:
-		a = a + k[0];
-		/* case 0: nothing left to add */
-	}
-	mix(a, b, c);
-
-	return c;
-}
-
-/*
- * linux dcache hash
- */
-unsigned ceph_str_hash_linux(const char *str, unsigned length)
-{
-	unsigned long hash = 0;
-	unsigned char c;
-
-	while (length--) {
-		c = *str++;
-		hash = (hash + (c << 4) + (c >> 4)) * 11;
-	}
-	return hash;
-}
-
-
-unsigned ceph_str_hash(int type, const char *s, unsigned len)
-{
-	switch (type) {
-	case CEPH_STR_HASH_LINUX:
-		return ceph_str_hash_linux(s, len);
-	case CEPH_STR_HASH_RJENKINS:
-		return ceph_str_hash_rjenkins(s, len);
-	default:
-		return -1;
-	}
-}
-
-const char *ceph_str_hash_name(int type)
-{
-	switch (type) {
-	case CEPH_STR_HASH_LINUX:
-		return "linux";
-	case CEPH_STR_HASH_RJENKINS:
-		return "rjenkins";
-	default:
-		return "unknown";
-	}
-}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef FS_CEPH_HASH_H
-#define FS_CEPH_HASH_H
-
-#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
-#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
-
-extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
-extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
-
-extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
-extern const char *ceph_str_hash_name(int type);
-
-#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
deleted file mode 100644
index c6179d3a26a2..000000000000
--- a/fs/ceph/ceph_strings.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Ceph string constants
- */
-#include "types.h"
-
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-	case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
-
-const char *ceph_mds_state_name(int s)
-{
-	switch (s) {
-		/* down and out */
-	case CEPH_MDS_STATE_DNE:        return "down:dne";
-	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
-		/* up and out */
-	case CEPH_MDS_STATE_BOOT:       return "up:boot";
-	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
-	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
-	case CEPH_MDS_STATE_CREATING:   return "up:creating";
-	case CEPH_MDS_STATE_STARTING:   return "up:starting";
-		/* up and in */
-	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
-	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
-	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
-	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
-	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
-	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
-	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
-	}
-	return "???";
-}
-
-const char *ceph_session_op_name(int op)
-{
-	switch (op) {
-	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
-	case CEPH_SESSION_OPEN: return "open";
-	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
-	case CEPH_SESSION_CLOSE: return "close";
-	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
-	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
-	case CEPH_SESSION_STALE: return "stale";
-	case CEPH_SESSION_RECALL_STATE: return "recall_state";
-	}
-	return "???";
-}
-
-const char *ceph_mds_op_name(int op)
-{
-	switch (op) {
-	case CEPH_MDS_OP_LOOKUP:  return "lookup";
-	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
-	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
-	case CEPH_MDS_OP_GETATTR:  return "getattr";
-	case CEPH_MDS_OP_SETXATTR: return "setxattr";
-	case CEPH_MDS_OP_SETATTR: return "setattr";
-	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
-	case CEPH_MDS_OP_READDIR: return "readdir";
-	case CEPH_MDS_OP_MKNOD: return "mknod";
-	case CEPH_MDS_OP_LINK: return "link";
-	case CEPH_MDS_OP_UNLINK: return "unlink";
-	case CEPH_MDS_OP_RENAME: return "rename";
-	case CEPH_MDS_OP_MKDIR: return "mkdir";
-	case CEPH_MDS_OP_RMDIR: return "rmdir";
-	case CEPH_MDS_OP_SYMLINK: return "symlink";
-	case CEPH_MDS_OP_CREATE: return "create";
-	case CEPH_MDS_OP_OPEN: return "open";
-	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
-	case CEPH_MDS_OP_LSSNAP: return "lssnap";
-	case CEPH_MDS_OP_MKSNAP: return "mksnap";
-	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
-	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
-	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
-	}
-	return "???";
-}
-
-const char *ceph_cap_op_name(int op)
-{
-	switch (op) {
-	case CEPH_CAP_OP_GRANT: return "grant";
-	case CEPH_CAP_OP_REVOKE: return "revoke";
-	case CEPH_CAP_OP_TRUNC: return "trunc";
-	case CEPH_CAP_OP_EXPORT: return "export";
-	case CEPH_CAP_OP_IMPORT: return "import";
-	case CEPH_CAP_OP_UPDATE: return "update";
-	case CEPH_CAP_OP_DROP: return "drop";
-	case CEPH_CAP_OP_FLUSH: return "flush";
-	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
-	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
-	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
-	case CEPH_CAP_OP_RELEASE: return "release";
-	case CEPH_CAP_OP_RENEW: return "renew";
-	}
-	return "???";
-}
-
-const char *ceph_lease_op_name(int o)
-{
-	switch (o) {
-	case CEPH_MDS_LEASE_REVOKE: return "revoke";
-	case CEPH_MDS_LEASE_RELEASE: return "release";
-	case CEPH_MDS_LEASE_RENEW: return "renew";
-	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
-	}
-	return "???";
-}
-
-const char *ceph_snap_op_name(int o)
-{
-	switch (o) {
-	case CEPH_SNAP_OP_UPDATE: return "update";
-	case CEPH_SNAP_OP_CREATE: return "create";
-	case CEPH_SNAP_OP_DESTROY: return "destroy";
-	case CEPH_SNAP_OP_SPLIT: return "split";
-	}
-	return "???";
-}
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/slab.h>
-#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-#endif
-
-#include "crush.h"
-
-const char *crush_bucket_alg_name(int alg)
-{
-	switch (alg) {
-	case CRUSH_BUCKET_UNIFORM: return "uniform";
-	case CRUSH_BUCKET_LIST: return "list";
-	case CRUSH_BUCKET_TREE: return "tree";
-	case CRUSH_BUCKET_STRAW: return "straw";
-	default: return "unknown";
-	}
-}
-
-/**
- * crush_get_bucket_item_weight - Get weight of an item in given bucket
- * @b: bucket pointer
- * @p: item index in bucket
- */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
-{
-	if (p >= b->size)
-		return 0;
-
-	switch (b->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		return ((struct crush_bucket_uniform *)b)->item_weight;
-	case CRUSH_BUCKET_LIST:
-		return ((struct crush_bucket_list *)b)->item_weights[p];
-	case CRUSH_BUCKET_TREE:
-		if (p & 1)
-			return ((struct crush_bucket_tree *)b)->node_weights[p];
-		return 0;
-	case CRUSH_BUCKET_STRAW:
-		return ((struct crush_bucket_straw *)b)->item_weights[p];
-	}
-	return 0;
-}
-
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
-	int i, b, c;
-
-	for (b = 0; b < map->max_buckets; b++) {
-		if (map->buckets[b] == NULL)
-			continue;
-		for (i = 0; i < map->buckets[b]->size; i++) {
-			c = map->buckets[b]->items[i];
-			BUG_ON(c >= map->max_devices ||
-			       c < -map->max_buckets);
-			if (c >= 0)
-				map->device_parents[c] = map->buckets[b]->id;
-			else
-				map->bucket_parents[-1-c] = map->buckets[b]->id;
-		}
-	}
-}
-
-void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
-{
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket_list(struct crush_bucket_list *b)
-{
-	kfree(b->item_weights);
-	kfree(b->sum_weights);
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
-{
-	kfree(b->node_weights);
-	kfree(b);
-}
-
-void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
-{
-	kfree(b->straws);
-	kfree(b->item_weights);
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket(struct crush_bucket *b)
-{
-	switch (b->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
-		break;
-	case CRUSH_BUCKET_LIST:
-		crush_destroy_bucket_list((struct crush_bucket_list *)b);
-		break;
-	case CRUSH_BUCKET_TREE:
-		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
-		break;
-	case CRUSH_BUCKET_STRAW:
-		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
-		break;
-	}
-}
-
-/**
- * crush_destroy - Destroy a crush_map
- * @map: crush_map pointer
- */
-void crush_destroy(struct crush_map *map)
-{
-	int b;
-
-	/* buckets */
-	if (map->buckets) {
-		for (b = 0; b < map->max_buckets; b++) {
-			if (map->buckets[b] == NULL)
-				continue;
-			crush_destroy_bucket(map->buckets[b]);
-		}
-		kfree(map->buckets);
-	}
-
-	/* rules */
-	if (map->rules) {
-		for (b = 0; b < map->max_rules; b++)
-			kfree(map->rules[b]);
-		kfree(map->rules);
-	}
-
-	kfree(map->bucket_parents);
-	kfree(map->device_parents);
-	kfree(map);
-}
-
-
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef CEPH_CRUSH_CRUSH_H
-#define CEPH_CRUSH_CRUSH_H
-
-#include <linux/types.h>
-
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-
-
-#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
-
-
-#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
-#define CRUSH_MAX_SET   10  /* max size of a mapping result */
-
-
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices.  A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
-	__u32 op;
-	__s32 arg1;
-	__s32 arg2;
-};
-
-/* step op codes */
-enum {
-	CRUSH_RULE_NOOP = 0,
-	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
-	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
-				      /* arg2 = type */
-	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
-	CRUSH_RULE_EMIT = 4,          /* no args */
-	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
-	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
-};
-
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N            0
-#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
-
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
-	__u8 ruleset;
-	__u8 type;
-	__u8 min_size;
-	__u8 max_size;
-};
-
-struct crush_rule {
-	__u32 len;
-	struct crush_rule_mask mask;
-	struct crush_rule_step steps[0];
-};
-
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
-			      (len)*sizeof(struct crush_rule_step))
-
-
-
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets).  Items within a bucket are chosen using one of a
- * few different algorithms.  The table summarizes how the speed of
- * each option measures up against mapping stability when items are
- * added or removed.
- *
- *  Bucket Alg     Speed       Additions    Removals
- *  ------------------------------------------------
- *  uniform         O(1)       poor         poor
- *  list            O(n)       optimal      poor
- *  tree            O(log n)   good         good
- *  straw           O(n)       optimal      optimal
- */
-enum {
-	CRUSH_BUCKET_UNIFORM = 1,
-	CRUSH_BUCKET_LIST = 2,
-	CRUSH_BUCKET_TREE = 3,
-	CRUSH_BUCKET_STRAW = 4
-};
-extern const char *crush_bucket_alg_name(int alg);
-
-struct crush_bucket {
-	__s32 id;        /* this'll be negative */
-	__u16 type;      /* non-zero; type=0 is reserved for devices */
-	__u8 alg;        /* one of CRUSH_BUCKET_* */
-	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
-	__u32 weight;    /* 16-bit fixed point */
-	__u32 size;      /* num items */
-	__s32 *items;
-
-	/*
-	 * cached random permutation: used for uniform bucket and for
-	 * the linear search fallback for the other bucket types.
-	 */
-	__u32 perm_x;  /* @x for which *perm is defined */
-	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
-	__u32 *perm;
-};
-
-struct crush_bucket_uniform {
-	struct crush_bucket h;
-	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
-};
-
-struct crush_bucket_list {
-	struct crush_bucket h;
-	__u32 *item_weights;  /* 16-bit fixed point */
-	__u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
-				 of weights 0..i, inclusive */
-};
-
-struct crush_bucket_tree {
-	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
-				   actual items */
-	__u8 num_nodes;
-	__u32 *node_weights;
-};
-
-struct crush_bucket_straw {
-	struct crush_bucket h;
-	__u32 *item_weights;   /* 16-bit fixed point */
-	__u32 *straws;         /* 16-bit fixed point */
-};
-
-
-
-/*
- * CRUSH map includes all buckets, rules, etc.
- */
-struct crush_map {
-	struct crush_bucket **buckets;
-	struct crush_rule **rules;
-
-	/*
-	 * Parent pointers to identify the parent bucket a device or
-	 * bucket in the hierarchy.  If an item appears more than
-	 * once, this is the _last_ time it appeared (where buckets
-	 * are processed in bucket id order, from -1 on down to
-	 * -max_buckets.
-	 */
-	__u32 *bucket_parents;
-	__u32 *device_parents;
-
-	__s32 max_buckets;
-	__u32 max_rules;
-	__s32 max_devices;
-};
-
-
-/* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket(struct crush_bucket *b);
-extern void crush_destroy(struct crush_map *map);
-
-#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
-
-#include <linux/types.h>
-#include "hash.h"
-
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do {			\
-		a = a-b;  a = a-c;  a = a^(c>>13);	\
-		b = b-c;  b = b-a;  b = b^(a<<8);	\
-		c = c-a;  c = c-b;  c = c^(b>>13);	\
-		a = a-b;  a = a-c;  a = a^(c>>12);	\
-		b = b-c;  b = b-a;  b = b^(a<<16);	\
-		c = c-a;  c = c-b;  c = c^(b>>5);	\
-		a = a-b;  a = a-c;  a = a^(c>>3);	\
-		b = b-c;  b = b-a;  b = b^(a<<10);	\
-		c = c-a;  c = c-b;  c = c^(b>>15);	\
-	} while (0)
-
-#define crush_hash_seed 1315423911
-
-static __u32 crush_hash32_rjenkins1(__u32 a)
-{
-	__u32 hash = crush_hash_seed ^ a;
-	__u32 b = a;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, a, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(x, a, hash);
-	crush_hashmix(b, y, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(a, x, hash);
-	crush_hashmix(y, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, d, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
-				      __u32 e)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(e, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	crush_hashmix(d, x, hash);
-	crush_hashmix(y, e, hash);
-	return hash;
-}
-
-
-__u32 crush_hash32(int type, __u32 a)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1(a);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_2(int type, __u32 a, __u32 b)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_2(a, b);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_3(a, b, c);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_4(a, b, c, d);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_5(a, b, c, d, e);
-	default:
-		return 0;
-	}
-}
-
-const char *crush_hash_name(int type)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return "rjenkins1";
-	default:
-		return "unknown";
-	}
-}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CEPH_CRUSH_HASH_H
-#define CEPH_CRUSH_HASH_H
-
-#define CRUSH_HASH_RJENKINS1   0
-
-#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
-
-extern const char *crush_hash_name(int type);
-
-extern __u32 crush_hash32(int type, __u32 a);
-extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
-extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
-extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
-			    __u32 e);
-
-#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/string.h>
-# include <linux/slab.h>
-# include <linux/bug.h>
-# include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
-#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-#endif
-
-#include "crush.h"
-#include "hash.h"
-
-/*
- * Implement the core CRUSH mapping algorithm.
- */
-
-/**
- * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
- * @map: the crush_map
- * @ruleset: the storage ruleset id (user defined)
- * @type: storage ruleset type (user defined)
- * @size: output set size
- */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
-{
-	int i;
-
-	for (i = 0; i < map->max_rules; i++) {
-		if (map->rules[i] &&
-		    map->rules[i]->mask.ruleset == ruleset &&
-		    map->rules[i]->mask.type == type &&
-		    map->rules[i]->mask.min_size <= size &&
-		    map->rules[i]->mask.max_size >= size)
-			return i;
-	}
-	return -1;
-}
-
-
-/*
- * bucket choose methods
- *
- * For each bucket algorithm, we have a "choose" method that, given a
- * crush input @x and replica position (usually, position in output set) @r,
- * will produce an item in the bucket.
- */
-
-/*
- * Choose based on a random permutation of the bucket.
- *
- * We used to use some prime number arithmetic to do this, but it
- * wasn't very random, and had some other bad behaviors.  Instead, we
- * calculate an actual random permutation of the bucket members.
- * Since this is expensive, we optimize for the r=0 case, which
- * captures the vast majority of calls.
- */
-static int bucket_perm_choose(struct crush_bucket *bucket,
-			      int x, int r)
-{
-	unsigned pr = r % bucket->size;
-	unsigned i, s;
-
-	/* start a new permutation if @x has changed */
-	if (bucket->perm_x != x || bucket->perm_n == 0) {
-		dprintk("bucket %d new x=%d\n", bucket->id, x);
-		bucket->perm_x = x;
-
-		/* optimize common r=0 case */
-		if (pr == 0) {
-			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
-				bucket->size;
-			bucket->perm[0] = s;
-			bucket->perm_n = 0xffff;   /* magic value, see below */
-			goto out;
-		}
-
-		for (i = 0; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm_n = 0;
-	} else if (bucket->perm_n == 0xffff) {
-		/* clean up after the r=0 case above */
-		for (i = 1; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm[bucket->perm[0]] = 0;
-		bucket->perm_n = 1;
-	}
-
-	/* calculate permutation up to pr */
-	for (i = 0; i < bucket->perm_n; i++)
-		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-	while (bucket->perm_n <= pr) {
-		unsigned p = bucket->perm_n;
-		/* no point in swapping the final entry */
-		if (p < bucket->size - 1) {
-			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
-				(bucket->size - p);
-			if (i) {
-				unsigned t = bucket->perm[p + i];
-				bucket->perm[p + i] = bucket->perm[p];
-				bucket->perm[p] = t;
-			}
-			dprintk(" perm_choose swap %d with %d\n", p, p+i);
-		}
-		bucket->perm_n++;
-	}
-	for (i = 0; i < bucket->size; i++)
-		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
-
-	s = bucket->perm[pr];
-out:
-	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
-		bucket->size, x, r, pr, s);
-	return bucket->items[s];
-}
-
-/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-				 int x, int r)
-{
-	return bucket_perm_choose(&bucket->h, x, r);
-}
-
-/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
-			      int x, int r)
-{
-	int i;
-
-	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
-					 r, bucket->h.id);
-		w &= 0xffff;
-		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
-			"sw %x rand %llx",
-			i, x, r, bucket->h.items[i], bucket->item_weights[i],
-			bucket->sum_weights[i], w);
-		w *= bucket->sum_weights[i];
-		w = w >> 16;
-		/*dprintk(" scaled %llx\n", w);*/
-		if (w < bucket->item_weights[i])
-			return bucket->h.items[i];
-	}
-
-	BUG_ON(1);
-	return 0;
-}
-
-
-/* (binary) tree */
-static int height(int n)
-{
-	int h = 0;
-	while ((n & 1) == 0) {
-		h++;
-		n = n >> 1;
-	}
-	return h;
-}
-
-static int left(int x)
-{
-	int h = height(x);
-	return x - (1 << (h-1));
-}
-
-static int right(int x)
-{
-	int h = height(x);
-	return x + (1 << (h-1));
-}
-
-static int terminal(int x)
-{
-	return x & 1;
-}
-
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
-			      int x, int r)
-{
-	int n, l;
-	__u32 w;
-	__u64 t;
-
-	/* start at root */
-	n = bucket->num_nodes >> 1;
-
-	while (!terminal(n)) {
-		/* pick point in [0, w) */
-		w = bucket->node_weights[n];
-		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
-					  bucket->h.id) * (__u64)w;
-		t = t >> 32;
-
-		/* descend to the left or right? */
-		l = left(n);
-		if (t < bucket->node_weights[l])
-			n = l;
-		else
-			n = right(n);
-	}
-
-	return bucket->h.items[n >> 1];
-}
-
-
-/* straw */
-
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
-			       int x, int r)
-{
-	int i;
-	int high = 0;
-	__u64 high_draw = 0;
-	__u64 draw;
-
-	for (i = 0; i < bucket->h.size; i++) {
-		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
-		draw &= 0xffff;
-		draw *= bucket->straws[i];
-		if (i == 0 || draw > high_draw) {
-			high = i;
-			high_draw = draw;
-		}
-	}
-	return bucket->h.items[high];
-}
-
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
-{
-	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
-	switch (in->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-					  x, r);
-	case CRUSH_BUCKET_LIST:
-		return bucket_list_choose((struct crush_bucket_list *)in,
-					  x, r);
-	case CRUSH_BUCKET_TREE:
-		return bucket_tree_choose((struct crush_bucket_tree *)in,
-					  x, r);
-	case CRUSH_BUCKET_STRAW:
-		return bucket_straw_choose((struct crush_bucket_straw *)in,
-					   x, r);
-	default:
-		BUG_ON(1);
-		return in->items[0];
-	}
-}
-
-/*
- * true if device is marked "out" (failed, fully offloaded)
- * of the cluster
- */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
-{
-	if (weight[item] >= 0x10000)
-		return 0;
-	if (weight[item] == 0)
-		return 1;
-	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
-	    < weight[item])
-		return 0;
-	return 1;
-}
-
-/**
- * crush_choose - choose numrep distinct items of given type
- * @map: the crush_map
- * @bucket: the bucket we are choose an item from
- * @x: crush input value
- * @numrep: the number of items to choose
- * @type: the type of item to choose
- * @out: pointer to output vector
- * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @out2: second output vector for leaf items (if @recurse_to_leaf)
- */
-static int crush_choose(struct crush_map *map,
-			struct crush_bucket *bucket,
-			__u32 *weight,
-			int x, int numrep, int type,
-			int *out, int outpos,
-			int firstn, int recurse_to_leaf,
-			int *out2)
-{
-	int rep;
-	int ftotal, flocal;
-	int retry_descent, retry_bucket, skip_rep;
-	struct crush_bucket *in = bucket;
-	int r;
-	int i;
-	int item = 0;
-	int itemtype;
-	int collide, reject;
-	const int orig_tries = 5; /* attempts before we fall back to search */
-
-	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
-		bucket->id, x, outpos, numrep);
-
-	for (rep = outpos; rep < numrep; rep++) {
-		/* keep trying until we get a non-out, non-colliding item */
-		ftotal = 0;
-		skip_rep = 0;
-		do {
-			retry_descent = 0;
-			in = bucket;               /* initial bucket */
-
-			/* choose through intervening buckets */
-			flocal = 0;
-			do {
-				collide = 0;
-				retry_bucket = 0;
-				r = rep;
-				if (in->alg == CRUSH_BUCKET_UNIFORM) {
-					/* be careful */
-					if (firstn || numrep >= in->size)
-						/* r' = r + f_total */
-						r += ftotal;
-					else if (in->size % numrep == 0)
-						/* r'=r+(n+1)*f_local */
-						r += (numrep+1) *
-							(flocal+ftotal);
-					else
-						/* r' = r + n*f_local */
-						r += numrep * (flocal+ftotal);
-				} else {
-					if (firstn)
-						/* r' = r + f_total */
-						r += ftotal;
-					else
-						/* r' = r + n*f_local */
-						r += numrep * (flocal+ftotal);
-				}
-
-				/* bucket choose */
-				if (in->size == 0) {
-					reject = 1;
-					goto reject;
-				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
-					item = bucket_perm_choose(in, x, r);
-				else
-					item = crush_bucket_choose(in, x, r);
-				BUG_ON(item >= map->max_devices);
-
-				/* desired type? */
-				if (item < 0)
-					itemtype = map->buckets[-1-item]->type;
-				else
-					itemtype = 0;
-				dprintk("  item %d type %d\n", item, itemtype);
-
-				/* keep going? */
-				if (itemtype != type) {
-					BUG_ON(item >= 0 ||
-					       (-1-item) >= map->max_buckets);
-					in = map->buckets[-1-item];
-					retry_bucket = 1;
-					continue;
-				}
-
-				/* collision? */
-				for (i = 0; i < outpos; i++) {
-					if (out[i] == item) {
-						collide = 1;
-						break;
-					}
-				}
-
-				reject = 0;
-				if (recurse_to_leaf) {
-					if (item < 0) {
-						if (crush_choose(map,
-							 map->buckets[-1-item],
-							 weight,
-							 x, outpos+1, 0,
-							 out2, outpos,
-							 firstn, 0,
-							 NULL) <= outpos)
-							/* didn't get leaf */
-							reject = 1;
-					} else {
-						/* we already have a leaf! */
-						out2[outpos] = item;
-					}
-				}
-
-				if (!reject) {
-					/* out? */
-					if (itemtype == 0)
-						reject = is_out(map, weight,
-								item, x);
-					else
-						reject = 0;
-				}
-
-reject:
-				if (reject || collide) {
-					ftotal++;
-					flocal++;
-
-					if (collide && flocal < 3)
-						/* retry locally a few times */
-						retry_bucket = 1;
-					else if (flocal < in->size + orig_tries)
-						/* exhaustive bucket search */
-						retry_bucket = 1;
-					else if (ftotal < 20)
-						/* then retry descent */
-						retry_descent = 1;
-					else
-						/* else give up */
-						skip_rep = 1;
-					dprintk("  reject %d  collide %d  "
-						"ftotal %d  flocal %d\n",
-						reject, collide, ftotal,
-						flocal);
-				}
-			} while (retry_bucket);
-		} while (retry_descent);
-
-		if (skip_rep) {
-			dprintk("skip rep\n");
-			continue;
-		}
-
-		dprintk("CHOOSE got %d\n", item);
-		out[outpos] = item;
-		outpos++;
-	}
-
-	dprintk("CHOOSE returns %d\n", outpos);
-	return outpos;
-}
-
-
-/**
- * crush_do_rule - calculate a mapping with the given input and rule
- * @map: the crush_map
- * @ruleno: the rule id
- * @x: hash input
- * @result: pointer to result vector
- * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
- */
-int crush_do_rule(struct crush_map *map,
-		  int ruleno, int x, int *result, int result_max,
-		  int force, __u32 *weight)
-{
-	int result_len;
-	int force_context[CRUSH_MAX_DEPTH];
-	int force_pos = -1;
-	int a[CRUSH_MAX_SET];
-	int b[CRUSH_MAX_SET];
-	int c[CRUSH_MAX_SET];
-	int recurse_to_leaf;
-	int *w;
-	int wsize = 0;
-	int *o;
-	int osize;
-	int *tmp;
-	struct crush_rule *rule;
-	int step;
-	int i, j;
-	int numrep;
-	int firstn;
-	int rc = -1;
-
-	BUG_ON(ruleno >= map->max_rules);
-
-	rule = map->rules[ruleno];
-	result_len = 0;
-	w = a;
-	o = b;
-
-	/*
-	 * determine hierarchical context of force, if any.  note
-	 * that this may or may not correspond to the specific types
-	 * referenced by the crush rule.
-	 */
-	if (force >= 0) {
-		if (force >= map->max_devices ||
-		    map->device_parents[force] == 0) {
-			/*dprintk("CRUSH: forcefed device dne\n");*/
-			rc = -1;  /* force fed device dne */
-			goto out;
-		}
-		if (!is_out(map, weight, force, x)) {
-			while (1) {
-				force_context[++force_pos] = force;
-				if (force >= 0)
-					force = map->device_parents[force];
-				else
-					force = map->bucket_parents[-1-force];
-				if (force == 0)
-					break;
-			}
-		}
-	}
-
-	for (step = 0; step < rule->len; step++) {
-		firstn = 0;
-		switch (rule->steps[step].op) {
-		case CRUSH_RULE_TAKE:
-			w[0] = rule->steps[step].arg1;
-			if (force_pos >= 0) {
-				BUG_ON(force_context[force_pos] != w[0]);
-				force_pos--;
-			}
-			wsize = 1;
-			break;
-
-		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
-		case CRUSH_RULE_CHOOSE_FIRSTN:
-			firstn = 1;
-		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
-		case CRUSH_RULE_CHOOSE_INDEP:
-			BUG_ON(wsize == 0);
-
-			recurse_to_leaf =
-				rule->steps[step].op ==
-				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
-				rule->steps[step].op ==
-				CRUSH_RULE_CHOOSE_LEAF_INDEP;
-
-			/* reset output */
-			osize = 0;
-
-			for (i = 0; i < wsize; i++) {
-				/*
-				 * see CRUSH_N, CRUSH_N_MINUS macros.
-				 * basically, numrep <= 0 means relative to
-				 * the provided result_max
-				 */
-				numrep = rule->steps[step].arg1;
-				if (numrep <= 0) {
-					numrep += result_max;
-					if (numrep <= 0)
-						continue;
-				}
-				j = 0;
-				if (osize == 0 && force_pos >= 0) {
-					/* skip any intermediate types */
-					while (force_pos &&
-					       force_context[force_pos] < 0 &&
-					       rule->steps[step].arg2 !=
-					       map->buckets[-1 -
-					       force_context[force_pos]]->type)
-						force_pos--;
-					o[osize] = force_context[force_pos];
-					if (recurse_to_leaf)
-						c[osize] = force_context[0];
-					j++;
-					force_pos--;
-				}
-				osize += crush_choose(map,
-						      map->buckets[-1-w[i]],
-						      weight,
-						      x, numrep,
-						      rule->steps[step].arg2,
-						      o+osize, j,
-						      firstn,
-						      recurse_to_leaf, c+osize);
-			}
-
-			if (recurse_to_leaf)
-				/* copy final _leaf_ values to output set */
-				memcpy(o, c, osize*sizeof(*o));
-
-			/* swap t and w arrays */
-			tmp = o;
-			o = w;
-			w = tmp;
-			wsize = osize;
-			break;
-
-
-		case CRUSH_RULE_EMIT:
-			for (i = 0; i < wsize && result_len < result_max; i++) {
-				result[result_len] = w[i];
-				result_len++;
-			}
-			wsize = 0;
-			break;
-
-		default:
-			BUG_ON(1);
-		}
-	}
-	rc = result_len;
-
-out:
-	return rc;
-}
-
-
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef CEPH_CRUSH_MAPPER_H
-#define CEPH_CRUSH_MAPPER_H
-
-/*
- * CRUSH functions for find rules and then mapping an input to an
- * output set.
- *
- * LGPL2
- */
-
-#include "crush.h"
-
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
-			 int ruleno,
-			 int x, int *result, int result_max,
-			 int forcefeed,    /* -1 for none */
-			 __u32 *weights);
-
-#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <crypto/hash.h>
-
-#include "crypto.h"
-#include "decode.h"
-
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
-	if (*p + sizeof(u16) + sizeof(key->created) +
-	    sizeof(u16) + key->len > end)
-		return -ERANGE;
-	ceph_encode_16(p, key->type);
-	ceph_encode_copy(p, &key->created, sizeof(key->created));
-	ceph_encode_16(p, key->len);
-	ceph_encode_copy(p, key->key, key->len);
-	return 0;
-}
-
-int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
-{
-	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
-	key->type = ceph_decode_16(p);
-	ceph_decode_copy(p, &key->created, sizeof(key->created));
-	key->len = ceph_decode_16(p);
-	ceph_decode_need(p, end, key->len, bad);
-	key->key = kmalloc(key->len, GFP_NOFS);
-	if (!key->key)
-		return -ENOMEM;
-	ceph_decode_copy(p, key->key, key->len);
-	return 0;
-
-bad:
-	dout("failed to decode crypto key\n");
-	return -EINVAL;
-}
-
-int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
-{
-	int inlen = strlen(inkey);
-	int blen = inlen * 3 / 4;
-	void *buf, *p;
-	int ret;
-
-	dout("crypto_key_unarmor %s\n", inkey);
-	buf = kmalloc(blen, GFP_NOFS);
-	if (!buf)
-		return -ENOMEM;
-	blen = ceph_unarmor(buf, inkey, inkey+inlen);
-	if (blen < 0) {
-		kfree(buf);
-		return blen;
-	}
-
-	p = buf;
-	ret = ceph_crypto_key_decode(key, &p, p + blen);
-	kfree(buf);
-	if (ret)
-		return ret;
-	dout("crypto_key_unarmor key %p type %d len %d\n", key,
-	     key->type, key->len);
-	return 0;
-}
-
-
-
-#define AES_KEY_SIZE 16
-
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
-{
-	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-}
-
-static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
-
-static int ceph_aes_encrypt(const void *key, int key_len,
-			    void *dst, size_t *dst_len,
-			    const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[2], sg_out[1];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-	int ret;
-	void *iv;
-	int ivsize;
-	size_t zero_padding = (0x10 - (src_len & 0x0f));
-	char pad[16];
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	memset(pad, zero_padding, zero_padding);
-
-	*dst_len = src_len + zero_padding;
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 2);
-	sg_set_buf(&sg_in[0], src, src_len);
-	sg_set_buf(&sg_in[1], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-	/*
-	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
-			src, src_len, 1);
-	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
-			pad, zero_padding, 1);
-	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-				     src_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
-		pr_err("ceph_aes_crypt failed %d\n", ret);
-	/*
-	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
-			     size_t *dst_len,
-			     const void *src1, size_t src1_len,
-			     const void *src2, size_t src2_len)
-{
-	struct scatterlist sg_in[3], sg_out[1];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-	int ret;
-	void *iv;
-	int ivsize;
-	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
-	char pad[16];
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	memset(pad, zero_padding, zero_padding);
-
-	*dst_len = src1_len + src2_len + zero_padding;
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 3);
-	sg_set_buf(&sg_in[0], src1, src1_len);
-	sg_set_buf(&sg_in[1], src2, src2_len);
-	sg_set_buf(&sg_in[2], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-	/*
-	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
-			src1, src1_len, 1);
-	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
-			src2, src2_len, 1);
-	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
-			pad, zero_padding, 1);
-	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-				     src1_len + src2_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
-		pr_err("ceph_aes_crypt2 failed %d\n", ret);
-	/*
-	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-static int ceph_aes_decrypt(const void *key, int key_len,
-			    void *dst, size_t *dst_len,
-			    const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[1], sg_out[2];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
-	char pad[16];
-	void *iv;
-	int ivsize;
-	int ret;
-	int last_byte;
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 1);
-	sg_init_table(sg_out, 2);
-	sg_set_buf(sg_in, src, src_len);
-	sg_set_buf(&sg_out[0], dst, *dst_len);
-	sg_set_buf(&sg_out[1], pad, sizeof(pad));
-
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-
-	/*
-	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
-		       src, src_len, 1);
-	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0) {
-		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
-	}
-
-	if (src_len <= *dst_len)
-		last_byte = ((char *)dst)[src_len - 1];
-	else
-		last_byte = pad[src_len - *dst_len - 1];
-	if (last_byte <= 16 && src_len >= last_byte) {
-		*dst_len = src_len - last_byte;
-	} else {
-		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-		       last_byte, (int)src_len);
-		return -EPERM;  /* bad padding */
-	}
-	/*
-	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-static int ceph_aes_decrypt2(const void *key, int key_len,
-			     void *dst1, size_t *dst1_len,
-			     void *dst2, size_t *dst2_len,
-			     const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[1], sg_out[3];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
-	char pad[16];
-	void *iv;
-	int ivsize;
-	int ret;
-	int last_byte;
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	sg_init_table(sg_in, 1);
-	sg_set_buf(sg_in, src, src_len);
-	sg_init_table(sg_out, 3);
-	sg_set_buf(&sg_out[0], dst1, *dst1_len);
-	sg_set_buf(&sg_out[1], dst2, *dst2_len);
-	sg_set_buf(&sg_out[2], pad, sizeof(pad));
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-
-	/*
-	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
-		       src, src_len, 1);
-	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0) {
-		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
-	}
-
-	if (src_len <= *dst1_len)
-		last_byte = ((char *)dst1)[src_len - 1];
-	else if (src_len <= *dst1_len + *dst2_len)
-		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
-	else
-		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
-	if (last_byte <= 16 && src_len >= last_byte) {
-		src_len -= last_byte;
-	} else {
-		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-		       last_byte, (int)src_len);
-		return -EPERM;  /* bad padding */
-	}
-
-	if (src_len < *dst1_len) {
-		*dst1_len = src_len;
-		*dst2_len = 0;
-	} else {
-		*dst2_len = src_len - *dst1_len;
-	}
-	/*
-	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst1, *dst1_len, 1);
-	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst2, *dst2_len, 1);
-	*/
-
-	return 0;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		 const void *src, size_t src_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src_len)
-			return -ERANGE;
-		memcpy(dst, src, src_len);
-		*dst_len = src_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_decrypt(secret->key, secret->len, dst,
-					dst_len, src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
-			void *dst1, size_t *dst1_len,
-			void *dst2, size_t *dst2_len,
-			const void *src, size_t src_len)
-{
-	size_t t;
-
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst1_len + *dst2_len < src_len)
-			return -ERANGE;
-		t = min(*dst1_len, src_len);
-		memcpy(dst1, src, t);
-		*dst1_len = t;
-		src += t;
-		src_len -= t;
-		if (src_len) {
-			t = min(*dst2_len, src_len);
-			memcpy(dst2, src, t);
-			*dst2_len = t;
-		}
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_decrypt2(secret->key, secret->len,
-					 dst1, dst1_len, dst2, dst2_len,
-					 src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		 const void *src, size_t src_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src_len)
-			return -ERANGE;
-		memcpy(dst, src, src_len);
-		*dst_len = src_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_encrypt(secret->key, secret->len, dst,
-					dst_len, src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		  const void *src1, size_t src1_len,
-		  const void *src2, size_t src2_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src1_len + src2_len)
-			return -ERANGE;
-		memcpy(dst, src1, src1_len);
-		memcpy(dst + src1_len, src2, src2_len);
-		*dst_len = src1_len + src2_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
-					 src1, src1_len, src2, src2_len);
-
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _FS_CEPH_CRYPTO_H
-#define _FS_CEPH_CRYPTO_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * cryptographic secret
- */
-struct ceph_crypto_key {
-	int type;
-	struct ceph_timespec created;
-	int len;
-	void *key;
-};
-
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
-	kfree(key->key);
-}
-
-extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
-				  void **p, void *end);
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
-				  void **p, void *end);
-extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
-
-/* crypto.c */
-extern int ceph_decrypt(struct ceph_crypto_key *secret,
-			void *dst, size_t *dst_len,
-			const void *src, size_t src_len);
-extern int ceph_encrypt(struct ceph_crypto_key *secret,
-			void *dst, size_t *dst_len,
-			const void *src, size_t src_len);
-extern int ceph_decrypt2(struct ceph_crypto_key *secret,
-			void *dst1, size_t *dst1_len,
-			void *dst2, size_t *dst2_len,
-			const void *src, size_t src_len);
-extern int ceph_encrypt2(struct ceph_crypto_key *secret,
-			 void *dst, size_t *dst_len,
-			 const void *src1, size_t src1_len,
-			 const void *src2, size_t src2_len);
-
-/* armor.c */
-extern int ceph_armor(char *dst, const char *src, const char *end);
-extern int ceph_unarmor(char *dst, const char *src, const char *end);
-
-#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..94fe1d284c3a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/device.h>
 #include <linux/slab.h>
@@ -7,143 +7,48 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 #ifdef CONFIG_DEBUG_FS
 
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-
-	if (client->monc.monmap == NULL)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-	for (i = 0; i < client->monc.monmap->num_mon; i++) {
-		struct ceph_entity_inst *inst =
-			&client->monc.monmap->mon_inst[i];
-
-		seq_printf(s, "\t%s%lld\t%s\n",
-			   ENTITY_NAME(inst->name),
-			   pr_addr(&inst->addr.in_addr));
-	}
-	return 0;
-}
+#include "super.h"
+#include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
 {
 	int i;
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 
-	if (client->mdsc.mdsmap == NULL)
+	if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
 		return 0;
-	seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-	seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+	seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
 	seq_printf(s, "session_timeout %d\n",
-		       client->mdsc.mdsmap->m_session_timeout);
+		       fsc->mdsc->mdsmap->m_session_timeout);
 	seq_printf(s, "session_autoclose %d\n",
-		       client->mdsc.mdsmap->m_session_autoclose);
-	for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+		       fsc->mdsc->mdsmap->m_session_autoclose);
+	for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
 		struct ceph_entity_addr *addr =
-			&client->mdsc.mdsmap->m_info[i].addr;
-		int state = client->mdsc.mdsmap->m_info[i].state;
+			&fsc->mdsc->mdsmap->m_info[i].addr;
+		int state = fsc->mdsc->mdsmap->m_info[i].state;
 
-		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+			       ceph_pr_addr(&addr->in_addr),
 			       ceph_mds_state_name(state));
 	}
 	return 0;
 }
 
-static int osdmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-	struct rb_node *n;
-
-	if (client->osdc.osdmap == NULL)
-		return 0;
-	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-	seq_printf(s, "flags%s%s\n",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-		   " NEARFULL" : "",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-		   " FULL" : "");
-	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-		struct ceph_pg_pool_info *pool =
-			rb_entry(n, struct ceph_pg_pool_info, node);
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   pool->id, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
-	}
-	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-		struct ceph_entity_addr *addr =
-			&client->osdc.osdmap->osd_addr[i];
-		int state = client->osdc.osdmap->osd_state[i];
-		char sb[64];
-
-		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-			   i, pr_addr(&addr->in_addr),
-			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-			   ceph_osdmap_state_str(sb, sizeof(sb), state));
-	}
-	return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_client *monc = &client->monc;
-	struct rb_node *rp;
-
-	mutex_lock(&monc->mutex);
-
-	if (monc->have_mdsmap)
-		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-	if (monc->have_osdmap)
-		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-	if (monc->want_next_osdmap)
-		seq_printf(s, "want next osdmap\n");
-
-	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-		__u16 op;
-		req = rb_entry(rp, struct ceph_mon_generic_request, node);
-		op = le16_to_cpu(req->request->hdr.type);
-		if (op == CEPH_MSG_STATFS)
-			seq_printf(s, "%lld statfs\n", req->tid);
-		else
-			seq_printf(s, "%lld unknown\n", req->tid);
-	}
-
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
 static int mdsc_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct rb_node *rp;
 	int pathlen;
@@ -214,61 +119,12 @@ static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops;
-		int opcode, olen;
-		int i;
-
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1,
-			   le32_to_cpu(req->r_pgid.pool),
-			   le16_to_cpu(req->r_pgid.ps));
-
-		head = req->r_request->front.iov_base;
-		op = (void *)(head + 1);
-
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
-
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
-
-		for (i = 0; i < num_ops; i++) {
-			opcode = le16_to_cpu(op->op);
-			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-			op++;
-		}
-
-		seq_printf(s, "\n");
-	}
-	mutex_unlock(&osdc->request_mutex);
-	return 0;
-}
-
 static int caps_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 	int total, avail, used, reserved, min;
 
-	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
 		   "avail\t\t%d\n"
 		   "used\t\t%d\n"
@@ -280,8 +136,8 @@ static int caps_show(struct seq_file *s, void *p)
 
 static int dentry_lru_show(struct seq_file *s, void *ptr)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_dentry_info *di;
 
 	spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +151,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
 	return 0;
 }
 
-#define DEFINE_SHOW_FUNC(name)						\
-static int name##_open(struct inode *inode, struct file *file)		\
-{									\
-	struct seq_file *sf;						\
-	int ret;							\
-									\
-	ret = single_open(file, name, NULL);				\
-	sf = file->private_data;					\
-	sf->private = inode->i_private;					\
-	return ret;							\
-}									\
-									\
-static const struct file_operations name##_fops = {			\
-	.open		= name##_open,					\
-	.read		= seq_read,					\
-	.llseek		= seq_lseek,					\
-	.release	= single_release,				\
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
 
+/*
+ * debugfs
+ */
 static int congestion_kb_set(void *data, u64 val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		client->mount_args->congestion_kb = (int)val;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	fsc->mount_options->congestion_kb = (int)val;
 	return 0;
 }
 
 static int congestion_kb_get(void *data, u64 *val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		*val = (u64)client->mount_args->congestion_kb;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	*val = (u64)fsc->mount_options->congestion_kb;
 	return 0;
 }
 
-
 DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
 			congestion_kb_set, "%llu\n");
 
-int __init ceph_debugfs_init(void)
-{
-	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-	if (!ceph_debugfs_dir)
-		return -ENOMEM;
-	return 0;
-}
 
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
-	debugfs_remove(ceph_debugfs_dir);
+	dout("ceph_fs_debugfs_cleanup\n");
+	debugfs_remove(fsc->debugfs_bdi);
+	debugfs_remove(fsc->debugfs_congestion_kb);
+	debugfs_remove(fsc->debugfs_mdsmap);
+	debugfs_remove(fsc->debugfs_caps);
+	debugfs_remove(fsc->debugfs_mdsc);
+	debugfs_remove(fsc->debugfs_dentry_lru);
 }
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-	int ret = 0;
-	char name[80];
-
-	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-		 client->monc.auth->global_id);
-
-	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-	if (!client->debugfs_dir)
-		goto out;
+	char name[100];
+	int err = -ENOMEM;
 
-	client->monc.debugfs_file = debugfs_create_file("monc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &monc_show_fops);
-	if (!client->monc.debugfs_file)
-		goto out;
-
-	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &mdsc_show_fops);
-	if (!client->mdsc.debugfs_file)
+	dout("ceph_fs_debugfs_init\n");
+	fsc->debugfs_congestion_kb =
+		debugfs_create_file("writeback_congestion_kb",
+				    0600,
+				    fsc->client->debugfs_dir,
+				    fsc,
+				    &congestion_kb_fops);
+	if (!fsc->debugfs_congestion_kb)
 		goto out;
 
-	client->osdc.debugfs_file = debugfs_create_file("osdc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &osdc_show_fops);
-	if (!client->osdc.debugfs_file)
-		goto out;
+	dout("a\n");
 
-	client->debugfs_monmap = debugfs_create_file("monmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&monmap_show_fops);
-	if (!client->debugfs_monmap)
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
 		goto out;
 
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+	dout("b\n");
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
-					client->debugfs_dir,
-					client,
+					fsc->client->debugfs_dir,
+					fsc,
 					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
+	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
+	dout("ca\n");
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+						0600,
+						fsc->client->debugfs_dir,
+						fsc,
+						&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
-		goto out;
-
-	client->debugfs_caps = debugfs_create_file("caps",
+	dout("da\n");
+	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
-						   client->debugfs_dir,
-						   client,
+						   fsc->client->debugfs_dir,
+						   fsc,
 						   &caps_show_fops);
-	if (!client->debugfs_caps)
+	if (!fsc->debugfs_caps)
 		goto out;
 
-	client->debugfs_congestion_kb =
-		debugfs_create_file("writeback_congestion_kb",
-				    0600,
-				    client->debugfs_dir,
-				    client,
-				    &congestion_kb_fops);
-	if (!client->debugfs_congestion_kb)
+	dout("ea\n");
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
 		goto out;
 
-	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-						     name);
-
 	return 0;
 
 out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-	debugfs_remove(client->debugfs_bdi);
-	debugfs_remove(client->debugfs_caps);
-	debugfs_remove(client->debugfs_dentry_lru);
-	debugfs_remove(client->debugfs_osdmap);
-	debugfs_remove(client->debugfs_mdsmap);
-	debugfs_remove(client->debugfs_monmap);
-	debugfs_remove(client->osdc.debugfs_file);
-	debugfs_remove(client->mdsc.debugfs_file);
-	debugfs_remove(client->monc.debugfs_file);
-	debugfs_remove(client->debugfs_congestion_kb);
-	debugfs_remove(client->debugfs_dir);
-}
 
 #else  /* CONFIG_DEBUG_FS */
 
-int __init ceph_debugfs_init(void)
-{
-	return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
 
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index c5b6939fb32a..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
-	u64 v = get_unaligned_le64(*p);
-	*p += sizeof(u64);
-	return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-	u32 v = get_unaligned_le32(*p);
-	*p += sizeof(u32);
-	return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-	u16 v = get_unaligned_le16(*p);
-	*p += sizeof(u16);
-	return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-	u8 v = *(u8 *)*p;
-	(*p)++;
-	return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-	memcpy(pv, *p, n);
-	*p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u64), bad);	\
-		v = ceph_decode_64(p);				\
-	} while (0)
-#define ceph_decode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u32), bad);	\
-		v = ceph_decode_32(p);				\
-	} while (0)
-#define ceph_decode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u16), bad);	\
-		v = ceph_decode_16(p);				\
-	} while (0)
-#define ceph_decode_8_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u8), bad);	\
-		v = ceph_decode_8(p);				\
-	} while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
-	do {							\
-		ceph_decode_need(p, end, n, bad);		\
-		ceph_decode_copy(p, pv, n);			\
-	} while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-					const struct ceph_timespec *tv)
-{
-	ts->tv_sec = le32_to_cpu(tv->tv_sec);
-	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-					const struct timespec *ts)
-{
-	tv->tv_sec = cpu_to_le32(ts->tv_sec);
-	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = htons(a->in_addr.ss_family);
-	a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
-	a->in_addr.ss_family = ntohs(ss_family);
-	WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-	put_unaligned_le64(v, (__le64 *)*p);
-	*p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-	put_unaligned_le32(v, (__le32 *)*p);
-	*p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-	put_unaligned_le16(v, (__le16 *)*p);
-	*p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-	*(u8 *)*p = v;
-	(*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-	memcpy(*p, s, len);
-	*p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-					u64 ino, const char *path)
-{
-	u32 len = path ? strlen(path) : 0;
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, ino);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, path, len);
-	*p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
-				      const char *s, u32 len)
-{
-	BUG_ON(*p + sizeof(len) + len > end);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, s, len);
-	*p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u64), bad);	\
-		ceph_encode_64(p, v);				\
-	} while (0)
-#define ceph_encode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);			\
-	} while (0)
-#define ceph_encode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);			\
-	} while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
-	do {							\
-		ceph_encode_need(p, end, n, bad);		\
-		ceph_encode_copy(p, pv, n);			\
-	} while (0)
-#define ceph_encode_string_safe(p, end, s, n, bad)		\
-	do {							\
-		ceph_encode_need(p, end, n, bad);		\
-		ceph_encode_string(p, end, s, n);		\
-	} while (0)
-
-
-#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..a8d2aacc612b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * Directory operations: readdir, lookup, create, link, unlink,
@@ -227,15 +228,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct ceph_file_info *fi = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned frag = fpos_frag(filp->f_pos);
 	int off = fpos_off(filp->f_pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args->max_readdir;
-	const int max_bytes = client->mount_args->max_readdir_bytes;
+	const int max_entries = fsc->mount_options->max_readdir;
+	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -267,7 +268,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* can we use the dcache? */
 	spin_lock(&inode->i_lock);
 	if ((filp->f_pos == 2 || fi->dentry) &&
-	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
@@ -487,14 +488,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
 	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
-		   client->mount_args->snapdir_name) == 0) {
+		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +540,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op;
 	int err;
@@ -572,7 +573,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		spin_lock(&dir->i_lock);
 		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 		if (strncmp(dentry->d_name.name,
-			    client->mount_args->snapdir_name,
+			    fsc->mount_options->snapdir_name,
 			    dentry->d_name.len) &&
 		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +630,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      int mode, dev_t rdev)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -685,8 +686,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			    const char *dest)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -716,8 +717,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
 	int op;
@@ -758,8 +759,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -813,8 +814,8 @@ static int drop_caps_for_unlink(struct inode *inode)
  */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
@@ -854,8 +855,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -1076,7 +1077,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1177,7 +1178,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1193,7 +1194,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
 	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1209,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd, int mode,
 				int locked_dir)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
 	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
@@ -269,163 +270,6 @@ int ceph_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-/*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		put_page(pages[i]);
-	kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		__free_pages(pages[i], 0);
-	kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-	struct page **pages;
-	int i;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, flags);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(flags);
-		if (pages[i] == NULL) {
-			ceph_release_page_vector(pages, i);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
-		}
-		i++;
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	off &= ~PAGE_CACHE_MASK;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-
-	/* leading partial page? */
-	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
-		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)off);
-		zero_user_segment(pages[i], off, end);
-		len -= (end - off);
-		i++;
-	}
-	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-		zero_user_segment(pages[i], 0, len);
-	}
-}
-
-
 /*
  * Read a range of bytes striped over one or more objects.  Iterate over
  * objects we stripe over.  (That's not atomic, but good enough for now.)
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
 			struct page **pages, int num_pages,
 			int *checkeof)
 {
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
 	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
 
 more:
 	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
 				  ci->i_truncate_seq,
 				  ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
 
 		if (read < pos - off) {
 			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
+			ceph_zero_page_vector_range(page_off + read,
+						    pos - off - read, pages);
 		}
 		pos += ret;
 		read = pos - off;
@@ -495,8 +339,8 @@ more:
 		/* was original extent fully inside i_size? */
 		if (pos + left <= inode->i_size) {
 			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
+			ceph_zero_page_vector_range(page_off + read, len - read,
+						    pages);
 			read = len;
 			goto out;
 		}
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, off, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, off, len);
 
 		/*
 		 * flush any page cache pages in this range.  this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
+		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
 	if (ret >= 0)
 		*poff = off + ret;
 
 done:
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else
 		ceph_release_page_vector(pages, num_pages);
 	dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req;
 	struct page **pages;
 	int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	 */
 more:
 	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 				    ceph_vino(inode), pos, &len,
 				    CEPH_OSD_OP_WRITE, flags,
 				    ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
 	num_pages = calc_pages_for(pos, len);
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -673,7 +517,7 @@ more:
 			ret = PTR_ERR(pages);
 			goto out;
 		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
+		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
 		if (ret < 0) {
 			ceph_release_page_vector(pages, num_pages);
 			goto out;
@@ -689,7 +533,7 @@ more:
 	req->r_num_pages = num_pages;
 	req->r_inode = inode;
 
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		if (req->r_safe_callback) {
 			/*
@@ -701,11 +545,11 @@ more:
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	}
 
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else if (file->f_flags & O_SYNC)
 		ceph_release_page_vector(pages, num_pages);
 
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int want, got = 0;
 	int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
@@ -13,7 +13,8 @@
 #include <linux/pagevec.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 
 /*
  * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
 
 		dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
 		}
 
 		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
 			inode->i_size = ci->i_rbytes;
 		break;
 	default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	 */
 	if (rinfo->head->is_dentry && !req->r_aborted &&
 	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-					       client->mount_args->snapdir_name,
+					       fsc->mount_options->snapdir_name,
 					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
@@ -1728,8 +1729,8 @@ out:
  */
 int ceph_do_getattr(struct inode *inode, int mask)
 {
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..899578b0c46b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
 #include <linux/in.h>
 
-#include "ioctl.h"
 #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
 
 
 /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
 	int err, i;
@@ -98,7 +100,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	struct ceph_ioctl_dataloc dl;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..087afe9ca367 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/file.h>
 #include <linux/namei.h>
 
 #include "super.h"
 #include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
 
 /**
  * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(inode->i_sb)->mdsc;
+		ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..33568239a08e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,20 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
-#include "mds_client.h"
-#include "mon_client.h"
 #include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +289,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
 		if (s->s_authorizer)
-			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
-				s->s_mdsc->client->monc.auth, s->s_authorizer);
+		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+			     s->s_mdsc->fsc->client->monc.auth,
+			     s->s_authorizer);
 		kfree(s);
 	}
 }
@@ -344,7 +348,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(mdsc->client->msgr, &s->s_con);
+	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
 	s->s_con.private = s;
 	s->s_con.ops = &mds_con_ops;
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +603,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	} else if (req->r_dentry) {
 		struct inode *dir = req->r_dentry->d_parent->d_inode;
 
-		if (dir->i_sb != mdsc->client->sb) {
+		if (dir->i_sb != mdsc->fsc->sb) {
 			/* not this fs! */
 			inode = req->r_dentry->d_inode;
 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +888,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	__ceph_remove_cap(cap);
 	if (!__ceph_is_any_real_caps(ci)) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&mdsc->cap_dirty_lock);
 		if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1150,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
 	struct ceph_msg *msg, *partial = NULL;
 	struct ceph_mds_cap_release *head;
 	int err = -ENOMEM;
-	int extra = mdsc->client->mount_args->cap_release_safety;
+	int extra = mdsc->fsc->mount_options->cap_release_safety;
 	int num;
 
 	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2089,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 	/* insert trace into our cache */
 	mutex_lock(&req->r_fill_mutex);
-	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
@@ -2613,7 +2617,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 			 struct ceph_mds_session *session,
 			 struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
@@ -2891,10 +2895,16 @@ static void delayed_work(struct work_struct *work)
 	schedule_delayed(mdsc);
 }
 
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
 
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 {
-	mdsc->client = client;
+	struct ceph_mds_client *mdsc;
+
+	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+	if (!mdsc)
+		return -ENOMEM;
+	mdsc->fsc = fsc;
+	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 	if (mdsc->mdsmap == NULL)
@@ -2927,7 +2937,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
 
 	ceph_caps_init(mdsc);
-	ceph_adjust_min_caps(mdsc, client->min_caps);
+	ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
 	return 0;
 }
@@ -2939,7 +2949,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_request *req;
-	struct ceph_client *client = mdsc->client;
+	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2957,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    client->mount_args->mount_timeout * HZ);
+				    fsc->client->options->mount_timeout * HZ);
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3030,7 +3040,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
@@ -3053,7 +3063,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
 	int i, n = 0;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return true;
 
 	mutex_lock(&mdsc->mutex);
@@ -3071,8 +3081,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_client *client = mdsc->client;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3119,7 +3129,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3139,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 	ceph_caps_finalize(mdsc);
 }
 
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	ceph_mdsc_stop(mdsc);
+	fsc->mdsc = NULL;
+	kfree(mdsc);
+}
+
 
 /*
  * handle mds map update.
@@ -3145,14 +3164,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
 		return;
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
 	/* do we need it? */
-	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
 	mutex_lock(&mdsc->mutex);
 	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
 		dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3195,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	} else {
 		mdsc->mdsmap = newmap;  /* first mds map */
 	}
-	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
 
 	__wake_requests(mdsc, &mdsc->waiting_for_map);
 
@@ -3277,7 +3296,7 @@ static int get_authorizer(struct ceph_connection *con,
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	int ret = 0;
 
 	if (force_new && s->s_authorizer) {
@@ -3311,7 +3330,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
@@ -3320,12 +3339,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	if (ac->ops->invalidate_authorizer)
 		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
 
-	return ceph_monc_validate_auth(&mdsc->client->monc);
+	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
 static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3357,4 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.peer_reset = peer_reset,
 };
 
-
-
-
 /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
 
 /*
  * Some lock dependencies:
@@ -26,7 +26,7 @@
  *
  */
 
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 
 /*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
  * mds client state
  */
 struct ceph_mds_client {
-	struct ceph_client      *client;
+	struct ceph_fs_client  *fsc;
 	struct mutex            mutex;         /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
 	int		caps_avail_count;    /* unused, unreserved */
 	int		caps_min_count;      /* keep at least this many
 						(unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry 	  *debugfs_file;
-#endif
-
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int		  num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-			   struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
 
 #include "super.h"
 
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		}
 
 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+		     i+1, n, global_id, mds, inc,
+		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
 		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
 			m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include "types.h"
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-	u64 global_id;
-	struct ceph_entity_addr addr;
-	s32 state;
-	int num_export_targets;
-	bool laggy;
-	u32 *export_targets;
-};
-
-struct ceph_mdsmap {
-	u32 m_epoch, m_client_epoch, m_last_failure;
-	u32 m_root;
-	u32 m_session_timeout;          /* seconds */
-	u32 m_session_autoclose;        /* seconds */
-	u64 m_max_file_size;
-	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-	struct ceph_mds_info *m_info;
-
-	/* which object pools file data can be stored in */
-	int m_num_data_pg_pools;
-	u32 *m_data_pg_pools;
-	u32 m_cas_pg_pool;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-	if (w >= m->m_max_mds)
-		return NULL;
-	return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-	BUG_ON(w < 0);
-	if (w >= m->m_max_mds)
-		return CEPH_MDS_STATE_DNE;
-	return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
-	if (w >= 0 && w < m->m_max_mds)
-		return m->m_info[w].laggy;
-	return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-
-#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 17a09b32a591..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2432 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <net/tcp.h>
-
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
-
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system.  The messenger provides ordered and reliable
- * delivery.  We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error).  Acks allow sent messages to be discarded by
- * the sender.
- */
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-
-#ifdef CONFIG_LOCKDEP
-static struct lock_class_key socket_class;
-#endif
-
-
-static void queue_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
-
-/*
- * nicely render a sockaddr as a string.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
-	int i;
-	char *s;
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
-
-	spin_lock(&addr_str_lock);
-	i = last_addr_str++;
-	if (last_addr_str == MAX_ADDR_STR)
-		last_addr_str = 0;
-	spin_unlock(&addr_str_lock);
-	s = addr_str[i];
-
-	switch (ss->ss_family) {
-	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-			 (unsigned int)ntohs(in4->sin_port));
-		break;
-
-	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-			 (unsigned int)ntohs(in6->sin6_port));
-		break;
-
-	default:
-		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
-	}
-
-	return s;
-}
-
-static void encode_my_addr(struct ceph_messenger *msgr)
-{
-	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_addr(&msgr->my_enc_addr);
-}
-
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-
-int __init ceph_msgr_init(void)
-{
-	ceph_msgr_wq = create_workqueue("ceph-msgr");
-	if (IS_ERR(ceph_msgr_wq)) {
-		int ret = PTR_ERR(ceph_msgr_wq);
-		pr_err("msgr_init failed to create workqueue: %d\n", ret);
-		ceph_msgr_wq = NULL;
-		return ret;
-	}
-	return 0;
-}
-
-void ceph_msgr_exit(void)
-{
-	destroy_workqueue(ceph_msgr_wq);
-}
-
-void ceph_msgr_flush(void)
-{
-	flush_workqueue(ceph_msgr_wq);
-}
-
-
-/*
- * socket callback functions
- */
-
-/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-	if (sk->sk_state != TCP_CLOSE_WAIT) {
-		dout("ceph_data_ready on %p state = %lu, queueing work\n",
-		     con, con->state);
-		queue_con(con);
-	}
-}
-
-/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-
-	/* only queue to workqueue if there is data we want to write. */
-	if (test_bit(WRITE_PENDING, &con->state)) {
-		dout("ceph_write_space %p queueing write work\n", con);
-		queue_con(con);
-	} else {
-		dout("ceph_write_space %p nothing to write\n", con);
-	}
-
-	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
-	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-
-/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-
-	dout("ceph_state_change %p state = %lu sk_state = %u\n",
-	     con, con->state, sk->sk_state);
-
-	if (test_bit(CLOSED, &con->state))
-		return;
-
-	switch (sk->sk_state) {
-	case TCP_CLOSE:
-		dout("ceph_state_change TCP_CLOSE\n");
-	case TCP_CLOSE_WAIT:
-		dout("ceph_state_change TCP_CLOSE_WAIT\n");
-		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
-			if (test_bit(CONNECTING, &con->state))
-				con->error_msg = "connection failed";
-			else
-				con->error_msg = "socket closed";
-			queue_con(con);
-		}
-		break;
-	case TCP_ESTABLISHED:
-		dout("ceph_state_change TCP_ESTABLISHED\n");
-		queue_con(con);
-		break;
-	}
-}
-
-/*
- * set up socket callbacks
- */
-static void set_sock_callbacks(struct socket *sock,
-			       struct ceph_connection *con)
-{
-	struct sock *sk = sock->sk;
-	sk->sk_user_data = (void *)con;
-	sk->sk_data_ready = ceph_data_ready;
-	sk->sk_write_space = ceph_write_space;
-	sk->sk_state_change = ceph_state_change;
-}
-
-
-/*
- * socket helpers
- */
-
-/*
- * initiate connection to a remote socket.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
-	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
-	struct socket *sock;
-	int ret;
-
-	BUG_ON(con->sock);
-	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
-			       IPPROTO_TCP, &sock);
-	if (ret)
-		return ERR_PTR(ret);
-	con->sock = sock;
-	sock->sk->sk_allocation = GFP_NOFS;
-
-#ifdef CONFIG_LOCKDEP
-	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
-#endif
-
-	set_sock_callbacks(sock, con);
-
-	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-
-	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
-				 O_NONBLOCK);
-	if (ret == -EINPROGRESS) {
-		dout("connect %s EINPROGRESS sk_state = %u\n",
-		     pr_addr(&con->peer_addr.in_addr),
-		     sock->sk->sk_state);
-		ret = 0;
-	}
-	if (ret < 0) {
-		pr_err("connect %s error %d\n",
-		       pr_addr(&con->peer_addr.in_addr), ret);
-		sock_release(sock);
-		con->sock = NULL;
-		con->error_msg = "connect error";
-	}
-
-	if (ret < 0)
-		return ERR_PTR(ret);
-	return sock;
-}
-
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-	struct kvec iov = {buf, len};
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
-}
-
-/*
- * write something.  @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-		     size_t kvlen, size_t len, int more)
-{
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-	if (more)
-		msg.msg_flags |= MSG_MORE;
-	else
-		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
-
-	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
-}
-
-
-/*
- * Shutdown/close the socket for the given connection.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
-	int rc;
-
-	dout("con_close_socket on %p sock %p\n", con, con->sock);
-	if (!con->sock)
-		return 0;
-	set_bit(SOCK_CLOSED, &con->state);
-	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-	sock_release(con->sock);
-	con->sock = NULL;
-	clear_bit(SOCK_CLOSED, &con->state);
-	return rc;
-}
-
-/*
- * Reset a connection.  Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
-	list_del_init(&msg->list_head);
-	ceph_msg_put(msg);
-}
-static void ceph_msg_remove_list(struct list_head *head)
-{
-	while (!list_empty(head)) {
-		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
-							list_head);
-		ceph_msg_remove(msg);
-	}
-}
-
-static void reset_connection(struct ceph_connection *con)
-{
-	/* reset connection, out_queue, msg_ and connect_seq */
-	/* discard existing out_queue and msg_seq */
-	ceph_msg_remove_list(&con->out_queue);
-	ceph_msg_remove_list(&con->out_sent);
-
-	if (con->in_msg) {
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
-	con->connect_seq = 0;
-	con->out_seq = 0;
-	if (con->out_msg) {
-		ceph_msg_put(con->out_msg);
-		con->out_msg = NULL;
-	}
-	con->out_keepalive_pending = false;
-	con->in_seq = 0;
-	con->in_seq_acked = 0;
-}
-
-/*
- * mark a peer down.  drop any open connections.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
-	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-	clear_bit(KEEPALIVE_PENDING, &con->state);
-	clear_bit(WRITE_PENDING, &con->state);
-	mutex_lock(&con->mutex);
-	reset_connection(con);
-	con->peer_global_seq = 0;
-	cancel_delayed_work(&con->work);
-	mutex_unlock(&con->mutex);
-	queue_con(con);
-}
-
-/*
- * Reopen a closed connection, with a new peer address.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
-	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
-	set_bit(OPENING, &con->state);
-	clear_bit(CLOSED, &con->state);
-	memcpy(&con->peer_addr, addr, sizeof(*addr));
-	con->delay = 0;      /* reset backoff memory */
-	queue_con(con);
-}
-
-/*
- * return true if this connection ever successfully opened
- */
-bool ceph_con_opened(struct ceph_connection *con)
-{
-	return con->connect_seq > 0;
-}
-
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-	dout("con_get %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-	if (atomic_inc_not_zero(&con->nref))
-		return con;
-	return NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
-	dout("con_put %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-	BUG_ON(atomic_read(&con->nref) == 0);
-	if (atomic_dec_and_test(&con->nref)) {
-		BUG_ON(con->sock);
-		kfree(con);
-	}
-}
-
-/*
- * initialize a new connection.
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
-	dout("con_init %p\n", con);
-	memset(con, 0, sizeof(*con));
-	atomic_set(&con->nref, 1);
-	con->msgr = msgr;
-	mutex_init(&con->mutex);
-	INIT_LIST_HEAD(&con->out_queue);
-	INIT_LIST_HEAD(&con->out_sent);
-	INIT_DELAYED_WORK(&con->work, con_work);
-}
-
-
-/*
- * We maintain a global counter to order connection attempts.  Get
- * a unique seq greater than @gt.
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
-	u32 ret;
-
-	spin_lock(&msgr->global_seq_lock);
-	if (msgr->global_seq < gt)
-		msgr->global_seq = gt;
-	ret = ++msgr->global_seq;
-	spin_unlock(&msgr->global_seq_lock);
-	return ret;
-}
-
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off.  Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
-	struct ceph_msg *m = con->out_msg;
-
-	dout("prepare_write_message_footer %p\n", con);
-	con->out_kvec_is_msg = true;
-	con->out_kvec[v].iov_base = &m->footer;
-	con->out_kvec[v].iov_len = sizeof(m->footer);
-	con->out_kvec_bytes += sizeof(m->footer);
-	con->out_kvec_left++;
-	con->out_more = m->more_to_follow;
-	con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	int v = 0;
-
-	con->out_kvec_bytes = 0;
-	con->out_kvec_is_msg = true;
-	con->out_msg_done = false;
-
-	/* Sneak an ack in there first?  If we can get it into the same
-	 * TCP packet that's a good thing. */
-	if (con->in_seq > con->in_seq_acked) {
-		con->in_seq_acked = con->in_seq;
-		con->out_kvec[v].iov_base = &tag_ack;
-		con->out_kvec[v++].iov_len = 1;
-		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con->out_kvec[v].iov_base = &con->out_temp_ack;
-		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	}
-
-	m = list_first_entry(&con->out_queue,
-		       struct ceph_msg, list_head);
-	con->out_msg = m;
-	if (test_bit(LOSSYTX, &con->state)) {
-		list_del_init(&m->list_head);
-	} else {
-		/* put message on sent list */
-		ceph_msg_get(m);
-		list_move_tail(&m->list_head, &con->out_sent);
-	}
-
-	/*
-	 * only assign outgoing seq # if we haven't sent this message
-	 * yet.  if it is requeued, resend with it's original seq.
-	 */
-	if (m->needs_out_seq) {
-		m->hdr.seq = cpu_to_le64(++con->out_seq);
-		m->needs_out_seq = false;
-	}
-
-	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
-	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-	     le32_to_cpu(m->hdr.data_len),
-	     m->nr_pages);
-	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-
-	/* tag + hdr + front + middle */
-	con->out_kvec[v].iov_base = &tag_msg;
-	con->out_kvec[v++].iov_len = 1;
-	con->out_kvec[v].iov_base = &m->hdr;
-	con->out_kvec[v++].iov_len = sizeof(m->hdr);
-	con->out_kvec[v++] = m->front;
-	if (m->middle)
-		con->out_kvec[v++] = m->middle->vec;
-	con->out_kvec_left = v;
-	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-		(m->middle ? m->middle->vec.iov_len : 0);
-	con->out_kvec_cur = con->out_kvec;
-
-	/* fill in crc (except data pages), footer */
-	con->out_msg->hdr.crc =
-		cpu_to_le32(crc32c(0, (void *)&m->hdr,
-				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
-	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-	con->out_msg->footer.front_crc =
-		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-	if (m->middle)
-		con->out_msg->footer.middle_crc =
-			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-					   m->middle->vec.iov_len));
-	else
-		con->out_msg->footer.middle_crc = 0;
-	con->out_msg->footer.data_crc = 0;
-	dout("prepare_write_message front_crc %u data_crc %u\n",
-	     le32_to_cpu(con->out_msg->footer.front_crc),
-	     le32_to_cpu(con->out_msg->footer.middle_crc));
-
-	/* is there a data payload? */
-	if (le32_to_cpu(m->hdr.data_len) > 0) {
-		/* initialize page iterator */
-		con->out_msg_pos.page = 0;
-		if (m->pages)
-			con->out_msg_pos.page_pos =
-				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
-		else
-			con->out_msg_pos.page_pos = 0;
-		con->out_msg_pos.data_pos = 0;
-		con->out_msg_pos.did_page_crc = 0;
-		con->out_more = 1;  /* data + footer will follow */
-	} else {
-		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con, v);
-	}
-
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-	dout("prepare_write_ack %p %llu -> %llu\n", con,
-	     con->in_seq_acked, con->in_seq);
-	con->in_seq_acked = con->in_seq;
-
-	con->out_kvec[0].iov_base = &tag_ack;
-	con->out_kvec[0].iov_len = 1;
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con->out_kvec[1].iov_base = &con->out_temp_ack;
-	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-	dout("prepare_write_keepalive %p\n", con);
-	con->out_kvec[0].iov_base = &tag_keepalive;
-	con->out_kvec[0].iov_len = 1;
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = 1;
-	con->out_kvec_cur = con->out_kvec;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Connection negotiation.
- */
-
-static void prepare_connect_authorizer(struct ceph_connection *con)
-{
-	void *auth_buf;
-	int auth_len = 0;
-	int auth_protocol = 0;
-
-	mutex_unlock(&con->mutex);
-	if (con->ops->get_authorizer)
-		con->ops->get_authorizer(con, &auth_buf, &auth_len,
-					 &auth_protocol, &con->auth_reply_buf,
-					 &con->auth_reply_buf_len,
-					 con->auth_retry);
-	mutex_lock(&con->mutex);
-
-	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
-	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-
-	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
-	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
-	con->out_kvec_left++;
-	con->out_kvec_bytes += auth_len;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_messenger *msgr,
-				 struct ceph_connection *con)
-{
-	int len = strlen(CEPH_BANNER);
-
-	con->out_kvec[0].iov_base = CEPH_BANNER;
-	con->out_kvec[0].iov_len = len;
-	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-static void prepare_write_connect(struct ceph_messenger *msgr,
-				  struct ceph_connection *con,
-				  int after_banner)
-{
-	unsigned global_seq = get_global_seq(con->msgr, 0);
-	int proto;
-
-	switch (con->peer_name.type) {
-	case CEPH_ENTITY_TYPE_MON:
-		proto = CEPH_MONC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_OSD:
-		proto = CEPH_OSDC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_MDS:
-		proto = CEPH_MDSC_PROTOCOL;
-		break;
-	default:
-		BUG();
-	}
-
-	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-	     con->connect_seq, global_seq, proto);
-
-	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
-	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq = cpu_to_le32(global_seq);
-	con->out_connect.protocol_version = cpu_to_le32(proto);
-	con->out_connect.flags = 0;
-
-	if (!after_banner) {
-		con->out_kvec_left = 0;
-		con->out_kvec_bytes = 0;
-	}
-	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
-	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
-	con->out_kvec_left++;
-	con->out_kvec_bytes += sizeof(con->out_connect);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-
-	prepare_connect_authorizer(con);
-}
-
-
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-	int ret;
-
-	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-	while (con->out_kvec_bytes > 0) {
-		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-				       con->out_kvec_left, con->out_kvec_bytes,
-				       con->out_more);
-		if (ret <= 0)
-			goto out;
-		con->out_kvec_bytes -= ret;
-		if (con->out_kvec_bytes == 0)
-			break;            /* done */
-		while (ret > 0) {
-			if (ret >= con->out_kvec_cur->iov_len) {
-				ret -= con->out_kvec_cur->iov_len;
-				con->out_kvec_cur++;
-				con->out_kvec_left--;
-			} else {
-				con->out_kvec_cur->iov_len -= ret;
-				con->out_kvec_cur->iov_base += ret;
-				ret = 0;
-				break;
-			}
-		}
-	}
-	con->out_kvec_left = 0;
-	con->out_kvec_is_msg = false;
-	ret = 1;
-out:
-	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-	     con->out_kvec_bytes, con->out_kvec_left, ret);
-	return ret;  /* done! */
-}
-
-#ifdef CONFIG_BLOCK
-static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
-{
-	if (!bio) {
-		*iter = NULL;
-		*seg = 0;
-		return;
-	}
-	*iter = bio;
-	*seg = bio->bi_idx;
-}
-
-static void iter_bio_next(struct bio **bio_iter, int *seg)
-{
-	if (*bio_iter == NULL)
-		return;
-
-	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
-
-	(*seg)++;
-	if (*seg == (*bio_iter)->bi_vcnt)
-		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
-}
-#endif
-
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->out_msg;
-	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
-	size_t len;
-	int crc = con->msgr->nocrc;
-	int ret;
-	int total_max_write;
-	int in_trail = 0;
-	size_t trail_len = (msg->trail ? msg->trail->length : 0);
-
-	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
-	     con->out_msg_pos.page_pos);
-
-#ifdef CONFIG_BLOCK
-	if (msg->bio && !msg->bio_iter)
-		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
-#endif
-
-	while (data_len > con->out_msg_pos.data_pos) {
-		struct page *page = NULL;
-		void *kaddr = NULL;
-		int max_write = PAGE_SIZE;
-		int page_shift = 0;
-
-		total_max_write = data_len - trail_len -
-			con->out_msg_pos.data_pos;
-
-		/*
-		 * if we are calculating the data crc (the default), we need
-		 * to map the page.  if our pages[] has been revoked, use the
-		 * zero page.
-		 */
-
-		/* have we reached the trail part of the data? */
-		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
-			in_trail = 1;
-
-			total_max_write = data_len - con->out_msg_pos.data_pos;
-
-			page = list_first_entry(&msg->trail->head,
-						struct page, lru);
-			if (crc)
-				kaddr = kmap(page);
-			max_write = PAGE_SIZE;
-		} else if (msg->pages) {
-			page = msg->pages[con->out_msg_pos.page];
-			if (crc)
-				kaddr = kmap(page);
-		} else if (msg->pagelist) {
-			page = list_first_entry(&msg->pagelist->head,
-						struct page, lru);
-			if (crc)
-				kaddr = kmap(page);
-#ifdef CONFIG_BLOCK
-		} else if (msg->bio) {
-			struct bio_vec *bv;
-
-			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
-			page = bv->bv_page;
-			page_shift = bv->bv_offset;
-			if (crc)
-				kaddr = kmap(page) + page_shift;
-			max_write = bv->bv_len;
-#endif
-		} else {
-			page = con->msgr->zero_page;
-			if (crc)
-				kaddr = page_address(con->msgr->zero_page);
-		}
-		len = min_t(int, max_write - con->out_msg_pos.page_pos,
-			    total_max_write);
-
-		if (crc && !con->out_msg_pos.did_page_crc) {
-			void *base = kaddr + con->out_msg_pos.page_pos;
-			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-
-			BUG_ON(kaddr == NULL);
-			con->out_msg->footer.data_crc =
-				cpu_to_le32(crc32c(tmpcrc, base, len));
-			con->out_msg_pos.did_page_crc = 1;
-		}
-		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos + page_shift,
-				      len,
-				      MSG_DONTWAIT | MSG_NOSIGNAL |
-				      MSG_MORE);
-
-		if (crc &&
-		    (msg->pages || msg->pagelist || msg->bio || in_trail))
-			kunmap(page);
-
-		if (ret <= 0)
-			goto out;
-
-		con->out_msg_pos.data_pos += ret;
-		con->out_msg_pos.page_pos += ret;
-		if (ret == len) {
-			con->out_msg_pos.page_pos = 0;
-			con->out_msg_pos.page++;
-			con->out_msg_pos.did_page_crc = 0;
-			if (in_trail)
-				list_move_tail(&page->lru,
-					       &msg->trail->head);
-			else if (msg->pagelist)
-				list_move_tail(&page->lru,
-					       &msg->pagelist->head);
-#ifdef CONFIG_BLOCK
-			else if (msg->bio)
-				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
-#endif
-		}
-	}
-
-	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-
-	/* prepare and queue up footer, too */
-	if (!crc)
-		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_left = 0;
-	con->out_kvec_cur = con->out_kvec;
-	prepare_write_message_footer(con, 0);
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-	int ret;
-
-	while (con->out_skip > 0) {
-		struct kvec iov = {
-			.iov_base = page_address(con->msgr->zero_page),
-			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
-		};
-
-		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
-		if (ret <= 0)
-			goto out;
-		con->out_skip -= ret;
-	}
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
-	dout("prepare_read_banner %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
-	dout("prepare_read_connect %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
-	dout("prepare_read_ack %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
-	dout("prepare_read_tag %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-	dout("prepare_read_message %p\n", con);
-	BUG_ON(con->in_msg != NULL);
-	con->in_base_pos = 0;
-	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
-	return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
-			int *to, int size, void *object)
-{
-	*to += size;
-	while (con->in_base_pos < *to) {
-		int left = *to - con->in_base_pos;
-		int have = size - left;
-		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-	int ret, to = 0;
-
-	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
-	/* peer's banner */
-	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
-			   &con->actual_peer_addr);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
-			   &con->peer_addr_for_me);
-	if (ret <= 0)
-		goto out;
-out:
-	return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
-	int ret, to = 0;
-
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
-			   con->auth_reply_buf);
-	if (ret <= 0)
-		goto out;
-
-	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-	     con, (int)con->in_reply.tag,
-	     le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
-out:
-	return ret;
-
-}
-
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-		pr_err("connect to %s got bad banner\n",
-		       pr_addr(&con->peer_addr.in_addr));
-		con->error_msg = "protocol error, bad banner";
-		return -1;
-	}
-	return 0;
-}
-
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
-	case AF_INET6:
-		return
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
-	}
-	return false;
-}
-
-static int addr_port(struct sockaddr_storage *ss)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		return ntohs(((struct sockaddr_in *)ss)->sin_port);
-	case AF_INET6:
-		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-	}
-	return 0;
-}
-
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		((struct sockaddr_in *)ss)->sin_port = htons(p);
-	case AF_INET6:
-		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-	}
-}
-
-/*
- * Parse an ip[:port] list into an addr array.  Use the default
- * monitor port if a port isn't specified.
- */
-int ceph_parse_ips(const char *c, const char *end,
-		   struct ceph_entity_addr *addr,
-		   int max_count, int *count)
-{
-	int i;
-	const char *p = c;
-
-	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-	for (i = 0; i < max_count; i++) {
-		const char *ipend;
-		struct sockaddr_storage *ss = &addr[i].in_addr;
-		struct sockaddr_in *in4 = (void *)ss;
-		struct sockaddr_in6 *in6 = (void *)ss;
-		int port;
-		char delim = ',';
-
-		if (*p == '[') {
-			delim = ']';
-			p++;
-		}
-
-		memset(ss, 0, sizeof(*ss));
-		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-			     delim, &ipend))
-			ss->ss_family = AF_INET;
-		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-				  delim, &ipend))
-			ss->ss_family = AF_INET6;
-		else
-			goto bad;
-		p = ipend;
-
-		if (delim == ']') {
-			if (*p != ']') {
-				dout("missing matching ']'\n");
-				goto bad;
-			}
-			p++;
-		}
-
-		/* port? */
-		if (p < end && *p == ':') {
-			port = 0;
-			p++;
-			while (p < end && *p >= '0' && *p <= '9') {
-				port = (port * 10) + (*p - '0');
-				p++;
-			}
-			if (port > 65535 || port == 0)
-				goto bad;
-		} else {
-			port = CEPH_MON_PORT;
-		}
-
-		addr_set_port(ss, port);
-
-		dout("parse_ips got %s\n", pr_addr(ss));
-
-		if (p == end)
-			break;
-		if (*p != ',')
-			goto bad;
-		p++;
-	}
-
-	if (p != end)
-		goto bad;
-
-	if (count)
-		*count = i + 1;
-	return 0;
-
-bad:
-	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-	return -EINVAL;
-}
-
-static int process_banner(struct ceph_connection *con)
-{
-	dout("process_banner on %p\n", con);
-
-	if (verify_hello(con) < 0)
-		return -1;
-
-	ceph_decode_addr(&con->actual_peer_addr);
-	ceph_decode_addr(&con->peer_addr_for_me);
-
-	/*
-	 * Make sure the other end is who we wanted.  note that the other
-	 * end may not yet know their ip address, so if it's 0.0.0.0, give
-	 * them the benefit of the doubt.
-	 */
-	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
-		   sizeof(con->peer_addr)) != 0 &&
-	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
-	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-			   pr_addr(&con->peer_addr.in_addr),
-			   (int)le32_to_cpu(con->peer_addr.nonce),
-			   pr_addr(&con->actual_peer_addr.in_addr),
-			   (int)le32_to_cpu(con->actual_peer_addr.nonce));
-		con->error_msg = "wrong peer at address";
-		return -1;
-	}
-
-	/*
-	 * did we learn our address?
-	 */
-	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
-		int port = addr_port(&con->msgr->inst.addr.in_addr);
-
-		memcpy(&con->msgr->inst.addr.in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(&con->msgr->inst.addr.in_addr, port);
-		encode_my_addr(con->msgr);
-		dout("process_banner learned my addr is %s\n",
-		     pr_addr(&con->msgr->inst.addr.in_addr));
-	}
-
-	set_bit(NEGOTIATING, &con->state);
-	prepare_read_connect(con);
-	return 0;
-}
-
-static void fail_protocol(struct ceph_connection *con)
-{
-	reset_connection(con);
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-
-	mutex_unlock(&con->mutex);
-	if (con->ops->bad_proto)
-		con->ops->bad_proto(con);
-	mutex_lock(&con->mutex);
-}
-
-static int process_connect(struct ceph_connection *con)
-{
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-	u64 req_feat = CEPH_FEATURE_REQUIRED;
-	u64 server_feat = le64_to_cpu(con->in_reply.features);
-
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-	switch (con->in_reply.tag) {
-	case CEPH_MSGR_TAG_FEATURES:
-		pr_err("%s%lld %s feature set mismatch,"
-		       " my %llx < server's %llx, missing %llx\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       sup_feat, server_feat, server_feat & ~sup_feat);
-		con->error_msg = "missing required protocol features";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADPROTOVER:
-		pr_err("%s%lld %s protocol version mismatch,"
-		       " my %d != server's %d\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
-		con->error_msg = "protocol version mismatch";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADAUTHORIZER:
-		con->auth_retry++;
-		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-		     con->auth_retry);
-		if (con->auth_retry == 2) {
-			con->error_msg = "connect authorization failure";
-			reset_connection(con);
-			set_bit(CLOSED, &con->state);
-			return -1;
-		}
-		con->auth_retry = 1;
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RESETSESSION:
-		/*
-		 * If we connected with a large connect_seq but the peer
-		 * has no record of a session with us (no connection, or
-		 * connect_seq == 0), they will send RESETSESION to indicate
-		 * that they must have reset their session, and may have
-		 * dropped messages.
-		 */
-		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_connect.connect_seq));
-		pr_err("%s%lld %s connection reset\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr));
-		reset_connection(con);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-
-		/* Tell ceph about it. */
-		mutex_unlock(&con->mutex);
-		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-		if (con->ops->peer_reset)
-			con->ops->peer_reset(con);
-		mutex_lock(&con->mutex);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_SESSION:
-		/*
-		 * If we sent a smaller connect_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_connect.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_GLOBAL:
-		/*
-		 * If we sent a smaller global_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_connect.global_seq));
-		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_connect.global_seq));
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_READY:
-		if (req_feat & ~server_feat) {
-			pr_err("%s%lld %s protocol feature mismatch,"
-			       " my required %llx > server's %llx, need %llx\n",
-			       ENTITY_NAME(con->peer_name),
-			       pr_addr(&con->peer_addr.in_addr),
-			       req_feat, server_feat, req_feat & ~server_feat);
-			con->error_msg = "missing required protocol features";
-			fail_protocol(con);
-			return -1;
-		}
-		clear_bit(CONNECTING, &con->state);
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
-		con->peer_features = server_feat;
-		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
-
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYTX, &con->state);
-
-		prepare_read_tag(con);
-		break;
-
-	case CEPH_MSGR_TAG_WAIT:
-		/*
-		 * If there is a connection race (we are opening
-		 * connections to each other), one of us may just have
-		 * to WAIT.  This shouldn't happen if we are the
-		 * client.
-		 */
-		pr_err("process_connect peer connecting WAIT\n");
-
-	default:
-		pr_err("connect protocol error, will retry\n");
-		con->error_msg = "protocol error, garbage tag during connect";
-		return -1;
-	}
-	return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-	int to = 0;
-
-	return read_partial(con, &to, sizeof(con->in_temp_ack),
-			    &con->in_temp_ack);
-}
-
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	u64 ack = le64_to_cpu(con->in_temp_ack);
-	u64 seq;
-
-	while (!list_empty(&con->out_sent)) {
-		m = list_first_entry(&con->out_sent, struct ceph_msg,
-				     list_head);
-		seq = le64_to_cpu(m->hdr.seq);
-		if (seq > ack)
-			break;
-		dout("got ack for seq %llu type %d at %p\n", seq,
-		     le16_to_cpu(m->hdr.type), m);
-		ceph_msg_remove(m);
-	}
-	prepare_read_tag(con);
-}
-
-
-
-
-static int read_partial_message_section(struct ceph_connection *con,
-					struct kvec *section,
-					unsigned int sec_len, u32 *crc)
-{
-	int ret, left;
-
-	BUG_ON(!section);
-
-	while (section->iov_len < sec_len) {
-		BUG_ON(section->iov_base == NULL);
-		left = sec_len - section->iov_len;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
-				       section->iov_len, left);
-		if (ret <= 0)
-			return ret;
-		section->iov_len += ret;
-		if (section->iov_len == sec_len)
-			*crc = crc32c(0, section->iov_base,
-				      section->iov_len);
-	}
-
-	return 1;
-}
-
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip);
-
-
-static int read_partial_message_pages(struct ceph_connection *con,
-				      struct page **pages,
-				      unsigned data_len, int datacrc)
-{
-	void *p;
-	int ret;
-	int left;
-
-	left = min((int)(data_len - con->in_msg_pos.data_pos),
-		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-	/* (page) data */
-	BUG_ON(pages == NULL);
-	p = kmap(pages[con->in_msg_pos.page]);
-	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-			       left);
-	if (ret > 0 && datacrc)
-		con->in_data_crc =
-			crc32c(con->in_data_crc,
-				  p + con->in_msg_pos.page_pos, ret);
-	kunmap(pages[con->in_msg_pos.page]);
-	if (ret <= 0)
-		return ret;
-	con->in_msg_pos.data_pos += ret;
-	con->in_msg_pos.page_pos += ret;
-	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-		con->in_msg_pos.page_pos = 0;
-		con->in_msg_pos.page++;
-	}
-
-	return ret;
-}
-
-#ifdef CONFIG_BLOCK
-static int read_partial_message_bio(struct ceph_connection *con,
-				    struct bio **bio_iter, int *bio_seg,
-				    unsigned data_len, int datacrc)
-{
-	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
-	void *p;
-	int ret, left;
-
-	if (IS_ERR(bv))
-		return PTR_ERR(bv);
-
-	left = min((int)(data_len - con->in_msg_pos.data_pos),
-		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
-
-	p = kmap(bv->bv_page) + bv->bv_offset;
-
-	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-			       left);
-	if (ret > 0 && datacrc)
-		con->in_data_crc =
-			crc32c(con->in_data_crc,
-				  p + con->in_msg_pos.page_pos, ret);
-	kunmap(bv->bv_page);
-	if (ret <= 0)
-		return ret;
-	con->in_msg_pos.data_pos += ret;
-	con->in_msg_pos.page_pos += ret;
-	if (con->in_msg_pos.page_pos == bv->bv_len) {
-		con->in_msg_pos.page_pos = 0;
-		iter_bio_next(bio_iter, bio_seg);
-	}
-
-	return ret;
-}
-#endif
-
-/*
- * read (part of) a message.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->in_msg;
-	int ret;
-	int to, left;
-	unsigned front_len, middle_len, data_len, data_off;
-	int datacrc = con->msgr->nocrc;
-	int skip;
-	u64 seq;
-
-	dout("read_partial_message con %p msg %p\n", con, m);
-
-	/* header */
-	while (con->in_base_pos < sizeof(con->in_hdr)) {
-		left = sizeof(con->in_hdr) - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)&con->in_hdr + con->in_base_pos,
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
-				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-			if (crc != le32_to_cpu(con->in_hdr.crc)) {
-				pr_err("read_partial_message bad hdr "
-				       " crc %u != expected %u\n",
-				       crc, con->in_hdr.crc);
-				return -EBADMSG;
-			}
-		}
-	}
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-	if (data_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_off = le16_to_cpu(con->in_hdr.data_off);
-
-	/* verify seq# */
-	seq = le64_to_cpu(con->in_hdr.seq);
-	if ((s64)seq - (s64)con->in_seq < 1) {
-		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
-			ENTITY_NAME(con->peer_name),
-			pr_addr(&con->peer_addr.in_addr),
-			seq, con->in_seq + 1);
-		con->in_base_pos = -front_len - middle_len - data_len -
-			sizeof(m->footer);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
-		return 0;
-	} else if ((s64)seq - (s64)con->in_seq > 1) {
-		pr_err("read_partial_message bad seq %lld expected %lld\n",
-		       seq, con->in_seq + 1);
-		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADMSG;
-	}
-
-	/* allocate message? */
-	if (!con->in_msg) {
-		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-		     con->in_hdr.front_len, con->in_hdr.data_len);
-		skip = 0;
-		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-		if (skip) {
-			/* skip this message */
-			dout("alloc_msg said skip message\n");
-			BUG_ON(con->in_msg);
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof(m->footer);
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			con->in_seq++;
-			return 0;
-		}
-		if (!con->in_msg) {
-			con->error_msg =
-				"error allocating memory for incoming message";
-			return -ENOMEM;
-		}
-		m = con->in_msg;
-		m->front.iov_len = 0;    /* haven't read it yet */
-		if (m->middle)
-			m->middle->vec.iov_len = 0;
-
-		con->in_msg_pos.page = 0;
-		if (m->pages)
-			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-		else
-			con->in_msg_pos.page_pos = 0;
-		con->in_msg_pos.data_pos = 0;
-	}
-
-	/* front */
-	ret = read_partial_message_section(con, &m->front, front_len,
-					   &con->in_front_crc);
-	if (ret <= 0)
-		return ret;
-
-	/* middle */
-	if (m->middle) {
-		ret = read_partial_message_section(con, &m->middle->vec,
-						   middle_len,
-						   &con->in_middle_crc);
-		if (ret <= 0)
-			return ret;
-	}
-#ifdef CONFIG_BLOCK
-	if (m->bio && !m->bio_iter)
-		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
-#endif
-
-	/* (page) data */
-	while (con->in_msg_pos.data_pos < data_len) {
-		if (m->pages) {
-			ret = read_partial_message_pages(con, m->pages,
-						 data_len, datacrc);
-			if (ret <= 0)
-				return ret;
-#ifdef CONFIG_BLOCK
-		} else if (m->bio) {
-
-			ret = read_partial_message_bio(con,
-						 &m->bio_iter, &m->bio_seg,
-						 data_len, datacrc);
-			if (ret <= 0)
-				return ret;
-#endif
-		} else {
-			BUG_ON(1);
-		}
-	}
-
-	/* footer */
-	to = sizeof(m->hdr) + sizeof(m->footer);
-	while (con->in_base_pos < to) {
-		left = to - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-				       (con->in_base_pos - sizeof(m->hdr)),
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-	     m, front_len, m->footer.front_crc, middle_len,
-	     m->footer.middle_crc, data_len, m->footer.data_crc);
-
-	/* crc ok? */
-	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-		pr_err("read_partial_message %p front crc %u != exp. %u\n",
-		       m, con->in_front_crc, m->footer.front_crc);
-		return -EBADMSG;
-	}
-	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-		pr_err("read_partial_message %p middle crc %u != exp %u\n",
-		       m, con->in_middle_crc, m->footer.middle_crc);
-		return -EBADMSG;
-	}
-	if (datacrc &&
-	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-		return -EBADMSG;
-	}
-
-	return 1; /* done! */
-}
-
-/*
- * Process message.  This happens in the worker thread.  The callback should
- * be careful not to do anything that waits on other incoming messages or it
- * may deadlock.
- */
-static void process_message(struct ceph_connection *con)
-{
-	struct ceph_msg *msg;
-
-	msg = con->in_msg;
-	con->in_msg = NULL;
-
-	/* if first message, set peer_name */
-	if (con->peer_name.type == 0)
-		con->peer_name = msg->hdr.src;
-
-	con->in_seq++;
-	mutex_unlock(&con->mutex);
-
-	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
-	     msg, le64_to_cpu(msg->hdr.seq),
-	     ENTITY_NAME(msg->hdr.src),
-	     le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.data_len),
-	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
-	con->ops->dispatch(con, msg);
-
-	mutex_lock(&con->mutex);
-	prepare_read_tag(con);
-}
-
-
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
-	struct ceph_messenger *msgr = con->msgr;
-	int ret = 1;
-
-	dout("try_write start %p state %lu nref %d\n", con, con->state,
-	     atomic_read(&con->nref));
-
-more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-
-	/* open the socket first? */
-	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
-		prepare_write_banner(msgr, con);
-		prepare_write_connect(msgr, con, 1);
-		prepare_read_banner(con);
-		set_bit(CONNECTING, &con->state);
-		clear_bit(NEGOTIATING, &con->state);
-
-		BUG_ON(con->in_msg);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %lu\n",
-		     con, con->state);
-		con->sock = ceph_tcp_connect(con);
-		if (IS_ERR(con->sock)) {
-			con->sock = NULL;
-			con->error_msg = "connect error";
-			ret = -1;
-			goto out;
-		}
-	}
-
-more_kvec:
-	/* kvec data queued? */
-	if (con->out_skip) {
-		ret = write_partial_skip(con);
-		if (ret <= 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_skip err %d\n", ret);
-			goto done;
-		}
-	}
-	if (con->out_kvec_left) {
-		ret = write_partial_kvec(con);
-		if (ret <= 0)
-			goto done;
-	}
-
-	/* msg pages? */
-	if (con->out_msg) {
-		if (con->out_msg_done) {
-			ceph_msg_put(con->out_msg);
-			con->out_msg = NULL;   /* we're done with this one */
-			goto do_next;
-		}
-
-		ret = write_partial_msg_pages(con);
-		if (ret == 1)
-			goto more_kvec;  /* we need to send the footer, too! */
-		if (ret == 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_msg_pages err %d\n",
-			     ret);
-			goto done;
-		}
-	}
-
-do_next:
-	if (!test_bit(CONNECTING, &con->state)) {
-		/* is anything else pending? */
-		if (!list_empty(&con->out_queue)) {
-			prepare_write_message(con);
-			goto more;
-		}
-		if (con->in_seq > con->in_seq_acked) {
-			prepare_write_ack(con);
-			goto more;
-		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
-	}
-
-	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->state);
-	dout("try_write nothing else to write.\n");
-done:
-	ret = 0;
-out:
-	dout("try_write done on %p\n", con);
-	return ret;
-}
-
-
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
-	int ret = -1;
-
-	if (!con->sock)
-		return 0;
-
-	if (test_bit(STANDBY, &con->state))
-		return 0;
-
-	dout("try_read start on %p\n", con);
-
-more:
-	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-	     con->in_base_pos);
-	if (test_bit(CONNECTING, &con->state)) {
-		if (!test_bit(NEGOTIATING, &con->state)) {
-			dout("try_read connecting\n");
-			ret = read_partial_banner(con);
-			if (ret <= 0)
-				goto done;
-			if (process_banner(con) < 0) {
-				ret = -1;
-				goto out;
-			}
-		}
-		ret = read_partial_connect(con);
-		if (ret <= 0)
-			goto done;
-		if (process_connect(con) < 0) {
-			ret = -1;
-			goto out;
-		}
-		goto more;
-	}
-
-	if (con->in_base_pos < 0) {
-		/*
-		 * skipping + discarding content.
-		 *
-		 * FIXME: there must be a better way to do this!
-		 */
-		static char buf[1024];
-		int skip = min(1024, -con->in_base_pos);
-		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
-		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
-		if (ret <= 0)
-			goto done;
-		con->in_base_pos += ret;
-		if (con->in_base_pos)
-			goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_READY) {
-		/*
-		 * what's next?
-		 */
-		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-		if (ret <= 0)
-			goto done;
-		dout("try_read got tag %d\n", (int)con->in_tag);
-		switch (con->in_tag) {
-		case CEPH_MSGR_TAG_MSG:
-			prepare_read_message(con);
-			break;
-		case CEPH_MSGR_TAG_ACK:
-			prepare_read_ack(con);
-			break;
-		case CEPH_MSGR_TAG_CLOSE:
-			set_bit(CLOSED, &con->state);   /* fixme */
-			goto done;
-		default:
-			goto bad_tag;
-		}
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-		ret = read_partial_message(con);
-		if (ret <= 0) {
-			switch (ret) {
-			case -EBADMSG:
-				con->error_msg = "bad crc";
-				ret = -EIO;
-				goto out;
-			case -EIO:
-				con->error_msg = "io error";
-				goto out;
-			default:
-				goto done;
-			}
-		}
-		if (con->in_tag == CEPH_MSGR_TAG_READY)
-			goto more;
-		process_message(con);
-		goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
-		ret = read_partial_ack(con);
-		if (ret <= 0)
-			goto done;
-		process_ack(con);
-		goto more;
-	}
-
-done:
-	ret = 0;
-out:
-	dout("try_read done on %p\n", con);
-	return ret;
-
-bad_tag:
-	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-	con->error_msg = "protocol error, garbage tag";
-	ret = -1;
-	goto out;
-}
-
-
-/*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
- *
- * There is some trickery going on with QUEUED and BUSY because we
- * only want a _single_ thread operating on each connection at any
- * point in time, but we want to use all available CPUs.
- *
- * The worker thread only proceeds if it can atomically set BUSY.  It
- * clears QUEUED and does it's thing.  When it thinks it's done, it
- * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
- * (tries again to set BUSY).
- *
- * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
- * try to queue work.  If that fails (work is already queued, or BUSY)
- * we give up (work also already being done or is queued) but leave QUEUED
- * set so that the worker thread will loop if necessary.
- */
-static void queue_con(struct ceph_connection *con)
-{
-	if (test_bit(DEAD, &con->state)) {
-		dout("queue_con %p ignoring: DEAD\n",
-		     con);
-		return;
-	}
-
-	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
-	}
-
-	set_bit(QUEUED, &con->state);
-	if (test_bit(BUSY, &con->state)) {
-		dout("queue_con %p - already BUSY\n", con);
-		con->ops->put(con);
-	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
-		dout("queue_con %p - already queued\n", con);
-		con->ops->put(con);
-	} else {
-		dout("queue_con %p\n", con);
-	}
-}
-
-/*
- * Do some work on a connection.  Drop a connection ref when we're done.
- */
-static void con_work(struct work_struct *work)
-{
-	struct ceph_connection *con = container_of(work, struct ceph_connection,
-						   work.work);
-	int backoff = 0;
-
-more:
-	if (test_and_set_bit(BUSY, &con->state) != 0) {
-		dout("con_work %p BUSY already set\n", con);
-		goto out;
-	}
-	dout("con_work %p start, clearing QUEUED\n", con);
-	clear_bit(QUEUED, &con->state);
-
-	mutex_lock(&con->mutex);
-
-	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
-		dout("con_work CLOSED\n");
-		con_close_socket(con);
-		goto done;
-	}
-	if (test_and_clear_bit(OPENING, &con->state)) {
-		/* reopen w/ new peer */
-		dout("con_work OPENING\n");
-		con_close_socket(con);
-	}
-
-	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
-	    try_read(con) < 0 ||
-	    try_write(con) < 0) {
-		mutex_unlock(&con->mutex);
-		backoff = 1;
-		ceph_fault(con);     /* error/fault path */
-		goto done_unlocked;
-	}
-
-done:
-	mutex_unlock(&con->mutex);
-
-done_unlocked:
-	clear_bit(BUSY, &con->state);
-	dout("con->state=%lu\n", con->state);
-	if (test_bit(QUEUED, &con->state)) {
-		if (!backoff || test_bit(OPENING, &con->state)) {
-			dout("con_work %p QUEUED reset, looping\n", con);
-			goto more;
-		}
-		dout("con_work %p QUEUED reset, but just faulted\n", con);
-		clear_bit(QUEUED, &con->state);
-	}
-	dout("con_work %p done\n", con);
-
-out:
-	con->ops->put(con);
-}
-
-
-/*
- * Generic error/fault handler.  A retry mechanism is used with
- * exponential backoff
- */
-static void ceph_fault(struct ceph_connection *con)
-{
-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
-	dout("fault %p state %lu to peer %s\n",
-	     con, con->state, pr_addr(&con->peer_addr.in_addr));
-
-	if (test_bit(LOSSYTX, &con->state)) {
-		dout("fault on LOSSYTX channel\n");
-		goto out;
-	}
-
-	mutex_lock(&con->mutex);
-	if (test_bit(CLOSED, &con->state))
-		goto out_unlock;
-
-	con_close_socket(con);
-
-	if (con->in_msg) {
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
-	/* Requeue anything that hasn't been acked */
-	list_splice_init(&con->out_sent, &con->out_queue);
-
-	/* If there are no messages in the queue, place the connection
-	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-		dout("fault setting STANDBY\n");
-		set_bit(STANDBY, &con->state);
-	} else {
-		/* retry after a delay. */
-		if (con->delay == 0)
-			con->delay = BASE_DELAY_INTERVAL;
-		else if (con->delay < MAX_DELAY_INTERVAL)
-			con->delay *= 2;
-		dout("fault queueing %p delay %lu\n", con, con->delay);
-		con->ops->get(con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay)) == 0)
-			con->ops->put(con);
-	}
-
-out_unlock:
-	mutex_unlock(&con->mutex);
-out:
-	/*
-	 * in case we faulted due to authentication, invalidate our
-	 * current tickets so that we can get new ones.
-	 */
-	if (con->auth_retry && con->ops->invalidate_authorizer) {
-		dout("calling invalidate_authorizer()\n");
-		con->ops->invalidate_authorizer(con);
-	}
-
-	if (con->ops->fault)
-		con->ops->fault(con);
-}
-
-
-
-/*
- * create a new messenger instance
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-	struct ceph_messenger *msgr;
-
-	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-	if (msgr == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock_init(&msgr->global_seq_lock);
-
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-	if (!msgr->zero_page) {
-		kfree(msgr);
-		return ERR_PTR(-ENOMEM);
-	}
-	kmap(msgr->zero_page);
-
-	if (myaddr)
-		msgr->inst.addr = *myaddr;
-
-	/* select a random nonce */
-	msgr->inst.addr.type = 0;
-	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
-	encode_my_addr(msgr);
-
-	dout("messenger_create %p\n", msgr);
-	return msgr;
-}
-
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-	dout("destroy %p\n", msgr);
-	kunmap(msgr->zero_page);
-	__free_page(msgr->zero_page);
-	kfree(msgr);
-	dout("destroyed messenger %p\n", msgr);
-}
-
-/*
- * Queue up an outgoing message on the given connection.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	if (test_bit(CLOSED, &con->state)) {
-		dout("con_send %p closed, dropping %p\n", con, msg);
-		ceph_msg_put(msg);
-		return;
-	}
-
-	/* set src+dst */
-	msg->hdr.src = con->msgr->inst.name;
-
-	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
-	msg->needs_out_seq = true;
-
-	/* queue */
-	mutex_lock(&con->mutex);
-	BUG_ON(!list_empty(&msg->list_head));
-	list_add_tail(&msg->list_head, &con->out_queue);
-	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.middle_len),
-	     le32_to_cpu(msg->hdr.data_len));
-	mutex_unlock(&con->mutex);
-
-	/* if there wasn't anything waiting to send before, queue
-	 * new work */
-	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-		queue_con(con);
-}
-
-/*
- * Revoke a message that was previously queued for send
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->mutex);
-	if (!list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p - was on queue\n", con, msg);
-		list_del_init(&msg->list_head);
-		ceph_msg_put(msg);
-		msg->hdr.seq = 0;
-	}
-	if (con->out_msg == msg) {
-		dout("con_revoke %p msg %p - was sending\n", con, msg);
-		con->out_msg = NULL;
-		if (con->out_kvec_is_msg) {
-			con->out_skip = con->out_kvec_bytes;
-			con->out_kvec_is_msg = false;
-		}
-		ceph_msg_put(msg);
-		msg->hdr.seq = 0;
-	}
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Revoke a message that we may be reading data into
- */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->mutex);
-	if (con->in_msg && con->in_msg == msg) {
-		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
-		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
-		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
-
-		/* skip rest of message */
-		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-			con->in_base_pos = con->in_base_pos -
-				sizeof(struct ceph_msg_header) -
-				front_len -
-				middle_len -
-				data_len -
-				sizeof(struct ceph_msg_footer);
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
-	} else {
-		dout("con_revoke_pages %p msg %p pages %p no-op\n",
-		     con, con->in_msg, msg);
-	}
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Queue a keepalive byte to ensure the tcp connection is alive.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
-	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-		queue_con(con);
-}
-
-
-/*
- * construct a new message with given type, size
- * the new msg has a ref count of 1.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-{
-	struct ceph_msg *m;
-
-	m = kmalloc(sizeof(*m), flags);
-	if (m == NULL)
-		goto out;
-	kref_init(&m->kref);
-	INIT_LIST_HEAD(&m->list_head);
-
-	m->hdr.tid = 0;
-	m->hdr.type = cpu_to_le16(type);
-	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-	m->hdr.version = 0;
-	m->hdr.front_len = cpu_to_le32(front_len);
-	m->hdr.middle_len = 0;
-	m->hdr.data_len = 0;
-	m->hdr.data_off = 0;
-	m->hdr.reserved = 0;
-	m->footer.front_crc = 0;
-	m->footer.middle_crc = 0;
-	m->footer.data_crc = 0;
-	m->footer.flags = 0;
-	m->front_max = front_len;
-	m->front_is_vmalloc = false;
-	m->more_to_follow = false;
-	m->pool = NULL;
-
-	/* front */
-	if (front_len) {
-		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, flags,
-						      PAGE_KERNEL);
-			m->front_is_vmalloc = true;
-		} else {
-			m->front.iov_base = kmalloc(front_len, flags);
-		}
-		if (m->front.iov_base == NULL) {
-			pr_err("msg_new can't allocate %d bytes\n",
-			     front_len);
-			goto out2;
-		}
-	} else {
-		m->front.iov_base = NULL;
-	}
-	m->front.iov_len = front_len;
-
-	/* middle */
-	m->middle = NULL;
-
-	/* data */
-	m->nr_pages = 0;
-	m->pages = NULL;
-	m->pagelist = NULL;
-	m->bio = NULL;
-	m->bio_iter = NULL;
-	m->bio_seg = 0;
-	m->trail = NULL;
-
-	dout("ceph_msg_new %p front %d\n", m, front_len);
-	return m;
-
-out2:
-	ceph_msg_put(m);
-out:
-	pr_err("msg_new can't create type %d front %d\n", type, front_len);
-	return NULL;
-}
-
-/*
- * Allocate "middle" portion of a message, if it is needed and wasn't
- * allocated by alloc_msg.  This allows us to read a small fixed-size
- * per-type header in the front and then gracefully fail (i.e.,
- * propagate the error to the caller based on info in the front) when
- * the middle is too large.
- */
-static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int type = le16_to_cpu(msg->hdr.type);
-	int middle_len = le32_to_cpu(msg->hdr.middle_len);
-
-	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
-	     ceph_msg_type_name(type), middle_len);
-	BUG_ON(!middle_len);
-	BUG_ON(msg->middle);
-
-	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
-	if (!msg->middle)
-		return -ENOMEM;
-	return 0;
-}
-
-/*
- * Generic message allocator, for incoming messages.
- */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip)
-{
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	int middle_len = le32_to_cpu(hdr->middle_len);
-	struct ceph_msg *msg = NULL;
-	int ret;
-
-	if (con->ops->alloc_msg) {
-		mutex_unlock(&con->mutex);
-		msg = con->ops->alloc_msg(con, hdr, skip);
-		mutex_lock(&con->mutex);
-		if (!msg || *skip)
-			return NULL;
-	}
-	if (!msg) {
-		*skip = 0;
-		msg = ceph_msg_new(type, front_len, GFP_NOFS);
-		if (!msg) {
-			pr_err("unable to allocate msg type %d len %d\n",
-			       type, front_len);
-			return NULL;
-		}
-	}
-	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-
-	if (middle_len && !msg->middle) {
-		ret = ceph_alloc_middle(con, msg);
-		if (ret < 0) {
-			ceph_msg_put(msg);
-			return NULL;
-		}
-	}
-
-	return msg;
-}
-
-
-/*
- * Free a generically kmalloc'd message.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-	dout("msg_kfree %p\n", m);
-	if (m->front_is_vmalloc)
-		vfree(m->front.iov_base);
-	else
-		kfree(m->front.iov_base);
-	kfree(m);
-}
-
-/*
- * Drop a msg ref.  Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
-{
-	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-
-	dout("ceph_msg_put last one on %p\n", m);
-	WARN_ON(!list_empty(&m->list_head));
-
-	/* drop middle, data, if any */
-	if (m->middle) {
-		ceph_buffer_put(m->middle);
-		m->middle = NULL;
-	}
-	m->nr_pages = 0;
-	m->pages = NULL;
-
-	if (m->pagelist) {
-		ceph_pagelist_release(m->pagelist);
-		kfree(m->pagelist);
-		m->pagelist = NULL;
-	}
-
-	m->trail = NULL;
-
-	if (m->pool)
-		ceph_msgpool_put(m->pool, m);
-	else
-		ceph_msg_kfree(m);
-}
-
-void ceph_msg_dump(struct ceph_msg *msg)
-{
-	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-		 msg->front_max, msg->nr_pages);
-	print_hex_dump(KERN_DEBUG, "header: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->hdr, sizeof(msg->hdr), true);
-	print_hex_dump(KERN_DEBUG, " front: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       msg->front.iov_base, msg->front.iov_len, true);
-	if (msg->middle)
-		print_hex_dump(KERN_DEBUG, "middle: ",
-			       DUMP_PREFIX_OFFSET, 16, 1,
-			       msg->middle->vec.iov_base,
-			       msg->middle->vec.iov_len, true);
-	print_hex_dump(KERN_DEBUG, "footer: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->footer, sizeof(msg->footer), true);
-}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 5a79450604ef..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,257 +0,0 @@
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-
-#include <linux/kref.h>
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-
-#include "types.h"
-#include "buffer.h"
-
-struct ceph_msg;
-struct ceph_connection;
-
-extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
-
-/*
- * Ceph defines these callbacks for handling connection events.
- */
-struct ceph_connection_operations {
-	struct ceph_connection *(*get)(struct ceph_connection *);
-	void (*put)(struct ceph_connection *);
-
-	/* handle an incoming message. */
-	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-
-	/* authorize an outgoing connection */
-	int (*get_authorizer) (struct ceph_connection *con,
-			       void **buf, int *len, int *proto,
-			       void **reply_buf, int *reply_len, int force_new);
-	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
-	int (*invalidate_authorizer)(struct ceph_connection *con);
-
-	/* protocol version mismatch */
-	void (*bad_proto) (struct ceph_connection *con);
-
-	/* there was some error on the socket (disconnect, whatever) */
-	void (*fault) (struct ceph_connection *con);
-
-	/* a remote host as terminated a message exchange session, and messages
-	 * we sent (or they tried to send us) may be lost. */
-	void (*peer_reset) (struct ceph_connection *con);
-
-	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
-					struct ceph_msg_header *hdr,
-					int *skip);
-};
-
-/* use format string %s%d */
-#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
-
-struct ceph_messenger {
-	struct ceph_entity_inst inst;    /* my name+address */
-	struct ceph_entity_addr my_enc_addr;
-	struct page *zero_page;          /* used in certain error cases */
-
-	bool nocrc;
-
-	/*
-	 * the global_seq counts connections i (attempt to) initiate
-	 * in order to disambiguate certain connect race conditions.
-	 */
-	u32 global_seq;
-	spinlock_t global_seq_lock;
-};
-
-/*
- * a single message.  it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, and possibly a
- * data payload (stored in some number of pages).
- */
-struct ceph_msg {
-	struct ceph_msg_header hdr;	/* header */
-	struct ceph_msg_footer footer;	/* footer */
-	struct kvec front;              /* unaligned blobs of message */
-	struct ceph_buffer *middle;
-	struct page **pages;            /* data payload.  NOT OWNER. */
-	unsigned nr_pages;              /* size of page array */
-	struct ceph_pagelist *pagelist; /* instead of pages */
-	struct list_head list_head;
-	struct kref kref;
-	struct bio  *bio;		/* instead of pages/pagelist */
-	struct bio  *bio_iter;		/* bio iterator */
-	int bio_seg;			/* current bio segment */
-	struct ceph_pagelist *trail;	/* the trailing part of the data */
-	bool front_is_vmalloc;
-	bool more_to_follow;
-	bool needs_out_seq;
-	int front_max;
-
-	struct ceph_msgpool *pool;
-};
-
-struct ceph_msg_pos {
-	int page, page_pos;  /* which page; offset in page */
-	int data_pos;        /* offset in data payload */
-	int did_page_crc;    /* true if we've calculated crc for current page */
-};
-
-/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL	(HZ/2)
-#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
-
-/*
- * ceph_connection state bit flags
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- */
-#define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define CONNECTING	1
-#define NEGOTIATING	2
-#define KEEPALIVE_PENDING      3
-#define WRITE_PENDING	4  /* we have data ready to send */
-#define QUEUED          5  /* there is work queued on this connection */
-#define BUSY            6  /* work is being done */
-#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
-			    * the ceph_connection around to maintain shared
-			    * state with the peer. */
-#define CLOSED		10 /* we've closed the connection */
-#define SOCK_CLOSED	11 /* socket state changed to closed */
-#define OPENING         13 /* open connection w/ (possibly new) peer */
-#define DEAD            14 /* dead, about to kfree */
-
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
-	void *private;
-	atomic_t nref;
-
-	const struct ceph_connection_operations *ops;
-
-	struct ceph_messenger *msgr;
-	struct socket *sock;
-	unsigned long state;	/* connection state (see flags above) */
-	const char *error_msg;  /* error message, if any */
-
-	struct ceph_entity_addr peer_addr; /* peer address */
-	struct ceph_entity_name peer_name; /* peer name */
-	struct ceph_entity_addr peer_addr_for_me;
-	unsigned peer_features;
-	u32 connect_seq;      /* identify the most recent connection
-				 attempt for this connection, client */
-	u32 peer_global_seq;  /* peer's global seq for this connection */
-
-	int auth_retry;       /* true if we need a newer authorizer */
-	void *auth_reply_buf;   /* where to put the authorizer reply */
-	int auth_reply_buf_len;
-
-	struct mutex mutex;
-
-	/* out queue */
-	struct list_head out_queue;
-	struct list_head out_sent;   /* sending or sent but unacked */
-	u64 out_seq;		     /* last message queued for send */
-	bool out_keepalive_pending;
-
-	u64 in_seq, in_seq_acked;  /* last message received, acked */
-
-	/* connection negotiation temps */
-	char in_banner[CEPH_BANNER_MAX_LEN];
-	union {
-		struct {  /* outgoing connection */
-			struct ceph_msg_connect out_connect;
-			struct ceph_msg_connect_reply in_reply;
-		};
-		struct {  /* incoming */
-			struct ceph_msg_connect in_connect;
-			struct ceph_msg_connect_reply out_reply;
-		};
-	};
-	struct ceph_entity_addr actual_peer_addr;
-
-	/* message out temps */
-	struct ceph_msg *out_msg;        /* sending message (== tail of
-					    out_sent) */
-	bool out_msg_done;
-	struct ceph_msg_pos out_msg_pos;
-
-	struct kvec out_kvec[8],         /* sending header/footer data */
-		*out_kvec_cur;
-	int out_kvec_left;   /* kvec's left in out_kvec */
-	int out_skip;        /* skip this many bytes */
-	int out_kvec_bytes;  /* total bytes left */
-	bool out_kvec_is_msg; /* kvec refers to out_msg */
-	int out_more;        /* there is more data after the kvecs */
-	__le64 out_temp_ack; /* for writing an ack */
-
-	/* message in temps */
-	struct ceph_msg_header in_hdr;
-	struct ceph_msg *in_msg;
-	struct ceph_msg_pos in_msg_pos;
-	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
-
-	char in_tag;         /* protocol control byte */
-	int in_base_pos;     /* bytes read */
-	__le64 in_temp_ack;  /* for reading an ack */
-
-	struct delayed_work work;	    /* send|recv work */
-	unsigned long       delay;          /* current delay interval */
-};
-
-
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
-			  struct ceph_entity_addr *addr,
-			  int max_count, int *count);
-
-
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-extern void ceph_msgr_flush(void);
-
-extern struct ceph_messenger *ceph_messenger_create(
-	struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-
-extern void ceph_con_init(struct ceph_messenger *msgr,
-			  struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
-			  struct ceph_entity_addr *addr);
-extern bool ceph_con_opened(struct ceph_connection *con);
-extern void ceph_con_close(struct ceph_connection *con);
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke_message(struct ceph_connection *con,
-				  struct ceph_msg *msg);
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-
-extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-	kref_get(&msg->kref);
-	return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-	kref_put(&msg->kref, ceph_msg_last_put);
-}
-
-extern void ceph_msg_dump(struct ceph_msg *msg);
-
-#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
-
-/*
- * Interact with Ceph monitor cluster.  Handle requests for new map
- * versions, and periodically resend as needed.  Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" are responsible for managing critical
- * cluster configuration and state information.  An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates.  We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure.  If the connection does break, we
- * randomly hunt for a new monitor.  Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-
-static const struct ceph_connection_operations mon_con_ops;
-
-static int __validate_auth(struct ceph_mon_client *monc);
-
-/*
- * Decode a monmap blob (e.g., during mount).
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-	struct ceph_monmap *m = NULL;
-	int i, err = -EINVAL;
-	struct ceph_fsid fsid;
-	u32 epoch, num_mon;
-	u16 version;
-	u32 len;
-
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
-
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
-	ceph_decode_16_safe(&p, end, version, bad);
-
-	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(&p);
-
-	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
-
-	if (num_mon >= CEPH_MAX_MON)
-		goto bad;
-	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
-	if (m == NULL)
-		return ERR_PTR(-ENOMEM);
-	m->fsid = fsid;
-	m->epoch = epoch;
-	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-	for (i = 0; i < num_mon; i++)
-		ceph_decode_addr(&m->mon_inst[i].addr);
-
-	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-	     m->num_mon);
-	for (i = 0; i < m->num_mon; i++)
-		dout("monmap_decode  mon%d is %s\n", i,
-		     pr_addr(&m->mon_inst[i].addr.in_addr));
-	return m;
-
-bad:
-	dout("monmap_decode failed with %d\n", err);
-	kfree(m);
-	return ERR_PTR(err);
-}
-
-/*
- * return true if *addr is included in the monmap.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-	int i;
-
-	for (i = 0; i < m->num_mon; i++)
-		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
-			return 1;
-	return 0;
-}
-
-/*
- * Send an auth request.
- */
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-	monc->pending_auth = 1;
-	monc->m_auth->front.iov_len = len;
-	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(monc->con, monc->m_auth);
-	ceph_msg_get(monc->m_auth);  /* keep our ref */
-	ceph_con_send(monc->con, monc->m_auth);
-}
-
-/*
- * Close monitor session, if any.
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-	if (monc->con) {
-		dout("__close_session closing mon%d\n", monc->cur_mon);
-		ceph_con_revoke(monc->con, monc->m_auth);
-		ceph_con_close(monc->con);
-		monc->cur_mon = -1;
-		monc->pending_auth = 0;
-		ceph_auth_reset(monc->auth);
-	}
-}
-
-/*
- * Open a session with a (new) monitor.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-	char r;
-	int ret;
-
-	if (monc->cur_mon < 0) {
-		get_random_bytes(&r, 1);
-		monc->cur_mon = r % monc->monmap->num_mon;
-		dout("open_session num=%d r=%d -> mon%d\n",
-		     monc->monmap->num_mon, r, monc->cur_mon);
-		monc->sub_sent = 0;
-		monc->sub_renew_after = jiffies;  /* i.e., expired */
-		monc->want_next_osdmap = !!monc->want_next_osdmap;
-
-		dout("open_session mon%d opening\n", monc->cur_mon);
-		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-		ceph_con_open(monc->con,
-			      &monc->monmap->mon_inst[monc->cur_mon].addr);
-
-		/* initiatiate authentication handshake */
-		ret = ceph_auth_build_hello(monc->auth,
-					    monc->m_auth->front.iov_base,
-					    monc->m_auth->front_max);
-		__send_prepared_auth_request(monc, ret);
-	} else {
-		dout("open_session mon%d already open\n", monc->cur_mon);
-	}
-	return 0;
-}
-
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
-	return time_after_eq(jiffies, monc->sub_renew_after);
-}
-
-/*
- * Reschedule delayed work timer.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-	unsigned delay;
-
-	if (monc->cur_mon < 0 || __sub_expired(monc))
-		delay = 10 * HZ;
-	else
-		delay = 20 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
-}
-
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-	     (unsigned)monc->sub_sent, __sub_expired(monc),
-	     monc->want_next_osdmap);
-	if ((__sub_expired(monc) && !monc->sub_sent) ||
-	    monc->want_next_osdmap == 1) {
-		struct ceph_msg *msg = monc->m_subscribe;
-		struct ceph_mon_subscribe_item *i;
-		void *p, *end;
-
-		p = msg->front.iov_base;
-		end = p + msg->front_max;
-
-		dout("__send_subscribe to 'mdsmap' %u+\n",
-		     (unsigned)monc->have_mdsmap);
-		if (monc->want_next_osdmap) {
-			dout("__send_subscribe to 'osdmap' %u\n",
-			     (unsigned)monc->have_osdmap);
-			ceph_encode_32(&p, 3);
-			ceph_encode_string(&p, end, "osdmap", 6);
-			i = p;
-			i->have = cpu_to_le64(monc->have_osdmap);
-			i->onetime = 1;
-			p += sizeof(*i);
-			monc->want_next_osdmap = 2;  /* requested */
-		} else {
-			ceph_encode_32(&p, 2);
-		}
-		ceph_encode_string(&p, end, "mdsmap", 6);
-		i = p;
-		i->have = cpu_to_le64(monc->have_mdsmap);
-		i->onetime = 0;
-		p += sizeof(*i);
-		ceph_encode_string(&p, end, "monmap", 6);
-		i = p;
-		i->have = 0;
-		i->onetime = 0;
-		p += sizeof(*i);
-
-		msg->front.iov_len = p - msg->front.iov_base;
-		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_revoke(monc->con, msg);
-		ceph_con_send(monc->con, ceph_msg_get(msg));
-
-		monc->sub_sent = jiffies | 1;  /* never 0 */
-	}
-}
-
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	unsigned seconds;
-	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
-
-	if (msg->front.iov_len < sizeof(*h))
-		goto bad;
-	seconds = le32_to_cpu(h->duration);
-
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		pr_info("mon%d %s session established\n",
-			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-		monc->hunting = false;
-	}
-	dout("handle_subscribe_ack after %d seconds\n", seconds);
-	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-	monc->sub_sent = 0;
-	mutex_unlock(&monc->mutex);
-	return;
-bad:
-	pr_err("got corrupt subscribe-ack msg\n");
-	ceph_msg_dump(msg);
-}
-
-/*
- * Keep track of which maps we have
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
-	mutex_lock(&monc->mutex);
-	monc->have_mdsmap = got;
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
-	mutex_lock(&monc->mutex);
-	monc->have_osdmap = got;
-	monc->want_next_osdmap = 0;
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-	dout("request_next_osdmap have %u\n", monc->have_osdmap);
-	mutex_lock(&monc->mutex);
-	if (!monc->want_next_osdmap)
-		monc->want_next_osdmap = 1;
-	if (monc->want_next_osdmap < 2)
-		__send_subscribe(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- *
- */
-int ceph_monc_open_session(struct ceph_mon_client *monc)
-{
-	if (!monc->con) {
-		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-		if (!monc->con)
-			return -ENOMEM;
-		ceph_con_init(monc->client->msgr, monc->con);
-		monc->con->private = monc;
-		monc->con->ops = &mon_con_ops;
-	}
-
-	mutex_lock(&monc->mutex);
-	__open_session(monc);
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	struct ceph_client *client = monc->client;
-	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-	void *p, *end;
-
-	mutex_lock(&monc->mutex);
-
-	dout("handle_monmap\n");
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	monmap = ceph_monmap_decode(p, end);
-	if (IS_ERR(monmap)) {
-		pr_err("problem decoding monmap, %d\n",
-		       (int)PTR_ERR(monmap));
-		goto out;
-	}
-
-	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
-		kfree(monmap);
-		goto out;
-	}
-
-	client->monc.monmap = monmap;
-	kfree(old);
-
-out:
-	mutex_unlock(&monc->mutex);
-	wake_up_all(&client->auth_wq);
-}
-
-/*
- * generic requests (e.g., statfs, poolop)
- */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-	struct ceph_mon_client *monc, u64 tid)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *n = monc->generic_request_tree.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_mon_generic_request, node);
-		if (tid < req->tid)
-			n = n->rb_left;
-		else if (tid > req->tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-			    struct ceph_mon_generic_request *new)
-{
-	struct rb_node **p = &monc->generic_request_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_mon_generic_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_generic_request, node);
-		if (new->tid < req->tid)
-			p = &(*p)->rb_left;
-		else if (new->tid > req->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->generic_request_tree);
-}
-
-static void release_generic_request(struct kref *kref)
-{
-	struct ceph_mon_generic_request *req =
-		container_of(kref, struct ceph_mon_generic_request, kref);
-
-	if (req->reply)
-		ceph_msg_put(req->reply);
-	if (req->request)
-		ceph_msg_put(req->request);
-
-	kfree(req);
-}
-
-static void put_generic_request(struct ceph_mon_generic_request *req)
-{
-	kref_put(&req->kref, release_generic_request);
-}
-
-static void get_generic_request(struct ceph_mon_generic_request *req)
-{
-	kref_get(&req->kref);
-}
-
-static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
-					 struct ceph_msg_header *hdr,
-					 int *skip)
-{
-	struct ceph_mon_client *monc = con->private;
-	struct ceph_mon_generic_request *req;
-	u64 tid = le64_to_cpu(hdr->tid);
-	struct ceph_msg *m;
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (!req) {
-		dout("get_generic_reply %lld dne\n", tid);
-		*skip = 1;
-		m = NULL;
-	} else {
-		dout("get_generic_reply %lld got %p\n", tid, req->reply);
-		m = ceph_msg_get(req->reply);
-		/*
-		 * we don't need to track the connection reading into
-		 * this reply because we only have one open connection
-		 * at a time, ever.
-		 */
-	}
-	mutex_unlock(&monc->mutex);
-	return m;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-			      struct ceph_mon_generic_request *req)
-{
-	int err;
-
-	/* register request */
-	mutex_lock(&monc->mutex);
-	req->tid = ++monc->last_tid;
-	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_generic_request(monc, req);
-	monc->num_generic_requests++;
-	ceph_con_send(monc->con, ceph_msg_get(req->request));
-	mutex_unlock(&monc->mutex);
-
-	err = wait_for_completion_interruptible(&req->completion);
-
-	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->generic_request_tree);
-	monc->num_generic_requests--;
-	mutex_unlock(&monc->mutex);
-
-	if (!err)
-		err = req->result;
-	return err;
-}
-
-/*
- * statfs
- */
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	u64 tid = le64_to_cpu(msg->hdr.tid);
-
-	if (msg->front.iov_len != sizeof(*reply))
-		goto bad;
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		*(struct ceph_statfs *)req->buf = reply->st;
-		req->result = 0;
-		get_generic_request(req);
-	}
-	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
-	return;
-
-bad:
-	pr_err("corrupt generic reply, tid %llu\n", tid);
-	ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_statfs *h;
-	int err;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	req->buf_len = sizeof(*buf);
-	init_completion(&req->completion);
-
-	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
-	if (!req->request)
-		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
-	if (!req->reply)
-		goto out;
-
-	/* fill out request */
-	h = req->request->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-
-	err = do_generic_request(monc, req);
-
-out:
-	kref_put(&req->kref, release_generic_request);
-	return err;
-}
-
-/*
- * pool ops
- */
-static int get_poolop_reply_buf(const char *src, size_t src_len,
-				char *dst, size_t dst_len)
-{
-	u32 buf_len;
-
-	if (src_len != sizeof(u32) + dst_len)
-		return -EINVAL;
-
-	buf_len = le32_to_cpu(*(u32 *)src);
-	if (buf_len != dst_len)
-		return -EINVAL;
-
-	memcpy(dst, src + sizeof(u32), dst_len);
-	return 0;
-}
-
-static void handle_poolop_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
-	u64 tid = le64_to_cpu(msg->hdr.tid);
-
-	if (msg->front.iov_len < sizeof(*reply))
-		goto bad;
-	dout("handle_poolop_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		if (req->buf_len &&
-		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
-				     msg->front.iov_len - sizeof(*reply),
-				     req->buf, req->buf_len) < 0) {
-			mutex_unlock(&monc->mutex);
-			goto bad;
-		}
-		req->result = le32_to_cpu(reply->reply_code);
-		get_generic_request(req);
-	}
-	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete(&req->completion);
-		put_generic_request(req);
-	}
-	return;
-
-bad:
-	pr_err("corrupt generic reply, tid %llu\n", tid);
-	ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous pool op.
- */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
-			u32 pool, u64 snapid,
-			char *buf, int len)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_poolop *h;
-	int err;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	req->buf_len = len;
-	init_completion(&req->completion);
-
-	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
-	if (!req->request)
-		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
-	if (!req->reply)
-		goto out;
-
-	/* fill out request */
-	req->request->hdr.version = cpu_to_le16(2);
-	h = req->request->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-	h->pool = cpu_to_le32(pool);
-	h->op = cpu_to_le32(op);
-	h->auid = 0;
-	h->snapid = cpu_to_le64(snapid);
-	h->name_len = 0;
-
-	err = do_generic_request(monc, req);
-
-out:
-	kref_put(&req->kref, release_generic_request);
-	return err;
-}
-
-int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-			    u32 pool, u64 *snapid)
-{
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-				   pool, 0, (char *)snapid, sizeof(*snapid));
-
-}
-
-int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-			    u32 pool, u64 snapid)
-{
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-				   pool, snapid, 0, 0);
-
-}
-
-/*
- * Resend pending generic requests.
- */
-static void __resend_generic_request(struct ceph_mon_client *monc)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *p;
-
-	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_mon_generic_request, node);
-		ceph_con_revoke(monc->con, req->request);
-		ceph_con_send(monc->con, ceph_msg_get(req->request));
-	}
-}
-
-/*
- * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
- * renew/retry subscription as needed (in case it is timing out, or we
- * got an ENOMEM).  And keep the monitor connection alive.
- */
-static void delayed_work(struct work_struct *work)
-{
-	struct ceph_mon_client *monc =
-		container_of(work, struct ceph_mon_client, delayed_work.work);
-
-	dout("monc delayed_work\n");
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		__close_session(monc);
-		__open_session(monc);  /* continue hunting */
-	} else {
-		ceph_con_keepalive(monc->con);
-
-		__validate_auth(monc);
-
-		if (monc->auth->ops->is_authenticated(monc->auth))
-			__send_subscribe(monc);
-	}
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * On startup, we build a temporary monmap populated with the IPs
- * provided by mount(2).
- */
-static int build_initial_monmap(struct ceph_mon_client *monc)
-{
-	struct ceph_mount_args *args = monc->client->mount_args;
-	struct ceph_entity_addr *mon_addr = args->mon_addr;
-	int num_mon = args->num_mon;
-	int i;
-
-	/* build initial monmap */
-	monc->monmap = kzalloc(sizeof(*monc->monmap) +
-			       num_mon*sizeof(monc->monmap->mon_inst[0]),
-			       GFP_KERNEL);
-	if (!monc->monmap)
-		return -ENOMEM;
-	for (i = 0; i < num_mon; i++) {
-		monc->monmap->mon_inst[i].addr = mon_addr[i];
-		monc->monmap->mon_inst[i].addr.nonce = 0;
-		monc->monmap->mon_inst[i].name.type =
-			CEPH_ENTITY_TYPE_MON;
-		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
-	}
-	monc->monmap->num_mon = num_mon;
-	monc->have_fsid = false;
-
-	/* release addr memory */
-	kfree(args->mon_addr);
-	args->mon_addr = NULL;
-	args->num_mon = 0;
-	return 0;
-}
-
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-	int err = 0;
-
-	dout("init\n");
-	memset(monc, 0, sizeof(*monc));
-	monc->client = cl;
-	monc->monmap = NULL;
-	mutex_init(&monc->mutex);
-
-	err = build_initial_monmap(monc);
-	if (err)
-		goto out;
-
-	monc->con = NULL;
-
-	/* authentication */
-	monc->auth = ceph_auth_init(cl->mount_args->name,
-				    cl->mount_args->secret);
-	if (IS_ERR(monc->auth))
-		return PTR_ERR(monc->auth);
-	monc->auth->want_keys =
-		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
-		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-
-	/* msgs */
-	err = -ENOMEM;
-	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-				     sizeof(struct ceph_mon_subscribe_ack),
-				     GFP_NOFS);
-	if (!monc->m_subscribe_ack)
-		goto out_monmap;
-
-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-	if (!monc->m_subscribe)
-		goto out_subscribe_ack;
-
-	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-	if (!monc->m_auth_reply)
-		goto out_subscribe;
-
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
-	monc->pending_auth = 0;
-	if (!monc->m_auth)
-		goto out_auth_reply;
-
-	monc->cur_mon = -1;
-	monc->hunting = true;
-	monc->sub_renew_after = jiffies;
-	monc->sub_sent = 0;
-
-	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	monc->generic_request_tree = RB_ROOT;
-	monc->num_generic_requests = 0;
-	monc->last_tid = 0;
-
-	monc->have_mdsmap = 0;
-	monc->have_osdmap = 0;
-	monc->want_next_osdmap = 1;
-	return 0;
-
-out_auth_reply:
-	ceph_msg_put(monc->m_auth_reply);
-out_subscribe:
-	ceph_msg_put(monc->m_subscribe);
-out_subscribe_ack:
-	ceph_msg_put(monc->m_subscribe_ack);
-out_monmap:
-	kfree(monc->monmap);
-out:
-	return err;
-}
-
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
-	dout("stop\n");
-	cancel_delayed_work_sync(&monc->delayed_work);
-
-	mutex_lock(&monc->mutex);
-	__close_session(monc);
-	if (monc->con) {
-		monc->con->private = NULL;
-		monc->con->ops->put(monc->con);
-		monc->con = NULL;
-	}
-	mutex_unlock(&monc->mutex);
-
-	ceph_auth_destroy(monc->auth);
-
-	ceph_msg_put(monc->m_auth);
-	ceph_msg_put(monc->m_auth_reply);
-	ceph_msg_put(monc->m_subscribe);
-	ceph_msg_put(monc->m_subscribe_ack);
-
-	kfree(monc->monmap);
-}
-
-static void handle_auth_reply(struct ceph_mon_client *monc,
-			      struct ceph_msg *msg)
-{
-	int ret;
-	int was_auth = 0;
-
-	mutex_lock(&monc->mutex);
-	if (monc->auth->ops)
-		was_auth = monc->auth->ops->is_authenticated(monc->auth);
-	monc->pending_auth = 0;
-	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
-				     msg->front.iov_len,
-				     monc->m_auth->front.iov_base,
-				     monc->m_auth->front_max);
-	if (ret < 0) {
-		monc->client->auth_err = ret;
-		wake_up_all(&monc->client->auth_wq);
-	} else if (ret > 0) {
-		__send_prepared_auth_request(monc, ret);
-	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
-		dout("authenticated, starting session\n");
-
-		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-		monc->client->msgr->inst.name.num =
-					cpu_to_le64(monc->auth->global_id);
-
-		__send_subscribe(monc);
-		__resend_generic_request(monc);
-	}
-	mutex_unlock(&monc->mutex);
-}
-
-static int __validate_auth(struct ceph_mon_client *monc)
-{
-	int ret;
-
-	if (monc->pending_auth)
-		return 0;
-
-	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
-			      monc->m_auth->front_max);
-	if (ret <= 0)
-		return ret; /* either an error, or no need to authenticate */
-	__send_prepared_auth_request(monc, ret);
-	return 0;
-}
-
-int ceph_monc_validate_auth(struct ceph_mon_client *monc)
-{
-	int ret;
-
-	mutex_lock(&monc->mutex);
-	ret = __validate_auth(monc);
-	mutex_unlock(&monc->mutex);
-	return ret;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_mon_client *monc = con->private;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	if (!monc)
-		return;
-
-	switch (type) {
-	case CEPH_MSG_AUTH_REPLY:
-		handle_auth_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		handle_subscribe_ack(monc, msg);
-		break;
-
-	case CEPH_MSG_STATFS_REPLY:
-		handle_statfs_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_POOLOP_REPLY:
-		handle_poolop_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_MON_MAP:
-		ceph_monc_handle_map(monc, msg);
-		break;
-
-	case CEPH_MSG_MDS_MAP:
-		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-		break;
-
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(&monc->client->osdc, msg);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-	ceph_msg_put(msg);
-}
-
-/*
- * Allocate memory for incoming message
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-				      struct ceph_msg_header *hdr,
-				      int *skip)
-{
-	struct ceph_mon_client *monc = con->private;
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	struct ceph_msg *m = NULL;
-
-	*skip = 0;
-
-	switch (type) {
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		m = ceph_msg_get(monc->m_subscribe_ack);
-		break;
-	case CEPH_MSG_POOLOP_REPLY:
-	case CEPH_MSG_STATFS_REPLY:
-		return get_generic_reply(con, hdr, skip);
-	case CEPH_MSG_AUTH_REPLY:
-		m = ceph_msg_get(monc->m_auth_reply);
-		break;
-	case CEPH_MSG_MON_MAP:
-	case CEPH_MSG_MDS_MAP:
-	case CEPH_MSG_OSD_MAP:
-		m = ceph_msg_new(type, front_len, GFP_NOFS);
-		break;
-	}
-
-	if (!m) {
-		pr_info("alloc_msg unknown type %d\n", type);
-		*skip = 1;
-	}
-	return m;
-}
-
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- */
-static void mon_fault(struct ceph_connection *con)
-{
-	struct ceph_mon_client *monc = con->private;
-
-	if (!monc)
-		return;
-
-	dout("mon_fault\n");
-	mutex_lock(&monc->mutex);
-	if (!con->private)
-		goto out;
-
-	if (monc->con && !monc->hunting)
-		pr_info("mon%d %s session lost, "
-			"hunting for new mon\n", monc->cur_mon,
-			pr_addr(&monc->con->peer_addr.in_addr));
-
-	__close_session(monc);
-	if (!monc->hunting) {
-		/* start hunting */
-		monc->hunting = true;
-		__open_session(monc);
-	} else {
-		/* already hunting, let's wait a bit */
-		__schedule_delayed(monc);
-	}
-out:
-	mutex_unlock(&monc->mutex);
-}
-
-static const struct ceph_connection_operations mon_con_ops = {
-	.get = ceph_con_get,
-	.put = ceph_con_put,
-	.dispatch = dispatch,
-	.fault = mon_fault,
-	.alloc_msg = mon_alloc_msg,
-};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/rbtree.h>
-
-#include "messenger.h"
-
-struct ceph_client;
-struct ceph_mount_args;
-struct ceph_auth_client;
-
-/*
- * The monitor map enumerates the set of all monitors.
- */
-struct ceph_monmap {
-	struct ceph_fsid fsid;
-	u32 epoch;
-	u32 num_mon;
-	struct ceph_entity_inst mon_inst[0];
-};
-
-struct ceph_mon_client;
-struct ceph_mon_generic_request;
-
-
-/*
- * Generic mechanism for resending monitor requests.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
-					 int newmon);
-
-/* a pending monitor request */
-struct ceph_mon_request {
-	struct ceph_mon_client *monc;
-	struct delayed_work delayed_work;
-	unsigned long delay;
-	ceph_monc_request_func_t do_request;
-};
-
-/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
- */
-struct ceph_mon_generic_request {
-	struct kref kref;
-	u64 tid;
-	struct rb_node node;
-	int result;
-	void *buf;
-	int buf_len;
-	struct completion completion;
-	struct ceph_msg *request;  /* original request */
-	struct ceph_msg *reply;    /* and reply */
-};
-
-struct ceph_mon_client {
-	struct ceph_client *client;
-	struct ceph_monmap *monmap;
-
-	struct mutex mutex;
-	struct delayed_work delayed_work;
-
-	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
-	int pending_auth;
-
-	bool hunting;
-	int cur_mon;                       /* last monitor i contacted */
-	unsigned long sub_sent, sub_renew_after;
-	struct ceph_connection *con;
-	bool have_fsid;
-
-	/* pending generic requests */
-	struct rb_root generic_request_tree;
-	int num_generic_requests;
-	u64 last_tid;
-
-	/* mds/osd map */
-	int want_next_osdmap; /* 1 = want, 2 = want+asked */
-	u32 have_osdmap, have_mdsmap;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_file;
-#endif
-};
-
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
-				struct ceph_entity_addr *addr);
-
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-
-/*
- * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map.  We will
- * periodically rerequest the map from the monitor cluster until we
- * get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
-			       struct ceph_statfs *buf);
-
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-				   u32 pool, u64 *snapid);
-
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-				   u32 pool, u64 snapid);
-
-#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-
-#include "msgpool.h"
-
-static void *alloc_fn(gfp_t gfp_mask, void *arg)
-{
-	struct ceph_msgpool *pool = arg;
-	void *p;
-
-	p = ceph_msg_new(0, pool->front_len, gfp_mask);
-	if (!p)
-		pr_err("msgpool %s alloc failed\n", pool->name);
-	return p;
-}
-
-static void free_fn(void *element, void *arg)
-{
-	ceph_msg_put(element);
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int size, bool blocking, const char *name)
-{
-	pool->front_len = front_len;
-	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
-	if (!pool->pool)
-		return -ENOMEM;
-	pool->name = name;
-	return 0;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-	mempool_destroy(pool->pool);
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
-				  int front_len)
-{
-	if (front_len > pool->front_len) {
-		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
-		       pool->name, front_len, pool->front_len);
-		WARN_ON(1);
-
-		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len, GFP_NOFS);
-	}
-
-	return mempool_alloc(pool->pool, GFP_NOFS);
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-	/* reset msg front_len; user may have changed it */
-	msg->front.iov_len = pool->front_len;
-	msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
-	kref_init(&msg->kref);  /* retake single ref */
-}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-
-#include <linux/mempool.h>
-#include "messenger.h"
-
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
-	const char *name;
-	mempool_t *pool;
-	int front_len;          /* preallocated payload size */
-};
-
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-			     int front_len, int size, bool blocking,
-			     const char *name);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
-					 int front_len);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-
-#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CEPH_MSGR_H
-#define CEPH_MSGR_H
-
-/*
- * Data types for message passing layer used by Ceph.
- */
-
-#define CEPH_MON_PORT    6789  /* default monitor port */
-
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST  6789
-#define CEPH_PORT_START  6800  /* non-monitors start here */
-#define CEPH_PORT_LAST   6900
-
-/*
- * tcp connection banner.  include a protocol version. and adjust
- * whenever the wire protocol changes.  try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph v027"
-#define CEPH_BANNER_MAX_LEN 30
-
-
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
-       return (__s32)a - (__s32)b;
-}
-
-
-/*
- * entity_name -- logical name for a process participating in the
- * network, e.g. 'mds0' or 'osd3'.
- */
-struct ceph_entity_name {
-	__u8 type;      /* CEPH_ENTITY_TYPE_* */
-	__le64 num;
-} __attribute__ ((packed));
-
-#define CEPH_ENTITY_TYPE_MON    0x01
-#define CEPH_ENTITY_TYPE_MDS    0x02
-#define CEPH_ENTITY_TYPE_OSD    0x04
-#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_AUTH   0x20
-
-#define CEPH_ENTITY_TYPE_ANY    0xFF
-
-extern const char *ceph_entity_type_name(int type);
-
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
-	__le32 type;
-	__le32 nonce;  /* unique id for process (e.g. pid) */
-	struct sockaddr_storage in_addr;
-} __attribute__ ((packed));
-
-struct ceph_entity_inst {
-	struct ceph_entity_name name;
-	struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-
-
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
-					  incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
-					  with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
-					  with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
-#define CEPH_MSGR_TAG_MSG           7  /* message */
-#define CEPH_MSGR_TAG_ACK           8  /* message ack */
-#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
-#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
-#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
-
-
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
-	__le64 features;     /* supported feature bits */
-	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
-	__le32 global_seq;   /* count connections initiated by this host */
-	__le32 connect_seq;  /* count connections initiated in this session */
-	__le32 protocol_version;
-	__le32 authorizer_protocol;
-	__le32 authorizer_len;
-	__u8  flags;         /* CEPH_MSG_CONNECT_* */
-} __attribute__ ((packed));
-
-struct ceph_msg_connect_reply {
-	__u8 tag;
-	__le64 features;     /* feature bits for this session */
-	__le32 global_seq;
-	__le32 connect_seq;
-	__le32 protocol_version;
-	__le32 authorizer_len;
-	__u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
-
-
-/*
- * message header
- */
-struct ceph_msg_header_old {
-	__le64 seq;       /* message seq# for this session */
-	__le64 tid;       /* transaction id */
-	__le16 type;      /* message type */
-	__le16 priority;  /* priority.  higher value == higher priority */
-	__le16 version;   /* version of message encoding */
-
-	__le32 front_len; /* bytes in main payload */
-	__le32 middle_len;/* bytes in middle payload */
-	__le32 data_len;  /* bytes of data payload */
-	__le16 data_off;  /* sender: include full offset;
-			     receiver: mask against ~PAGE_MASK */
-
-	struct ceph_entity_inst src, orig_src;
-	__le32 reserved;
-	__le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-struct ceph_msg_header {
-	__le64 seq;       /* message seq# for this session */
-	__le64 tid;       /* transaction id */
-	__le16 type;      /* message type */
-	__le16 priority;  /* priority.  higher value == higher priority */
-	__le16 version;   /* version of message encoding */
-
-	__le32 front_len; /* bytes in main payload */
-	__le32 middle_len;/* bytes in middle payload */
-	__le32 data_len;  /* bytes of data payload */
-	__le16 data_off;  /* sender: include full offset;
-			     receiver: mask against ~PAGE_MASK */
-
-	struct ceph_entity_name src;
-	__le32 reserved;
-	__le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-#define CEPH_MSG_PRIO_LOW     64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH    196
-#define CEPH_MSG_PRIO_HIGHEST 255
-
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
-	__le32 front_crc, middle_crc, data_crc;
-	__u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
-#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
-
-
-#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index 0edb43f1ef26..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1771 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#ifdef CONFIG_BLOCK
-#include <linux/bio.h>
-#endif
-
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
-
-#define OSD_OP_FRONT_LEN	4096
-#define OSD_OPREPLY_FRONT_LEN	512
-
-static const struct ceph_connection_operations osd_con_ops;
-static int __kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd);
-
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-
-static int op_needs_trail(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-	case CEPH_OSD_OP_CALL:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
-static int op_has_extent(int op)
-{
-	return (op == CEPH_OSD_OP_READ ||
-		op == CEPH_OSD_OP_WRITE);
-}
-
-void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
-			u64 snapid,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
-{
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
-
-	reqhead->snapid = cpu_to_le64(snapid);
-
-	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, bno,
-				      &objoff, &objlen);
-	if (*plen < orig_len)
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
-
-	if (op_has_extent(op->op)) {
-		op->extent.offset = objoff;
-		op->extent.length = objlen;
-	}
-	req->r_num_pages = calc_pages_for(off, *plen);
-
-	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-	     *bno, objoff, objlen, req->r_num_pages);
-
-}
-
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-
-/*
- * calculate the mapping of a file extent onto an object, and fill out the
- * request accordingly.  shorten extent as necessary if it crosses an
- * object boundary.
- *
- * fill osd op in request message.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino,
-			struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
-{
-	u64 bno;
-
-	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-			     plen, &bno, req, op);
-
-	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
-	req->r_oid_len = strlen(req->r_oid);
-}
-
-/*
- * requests
- */
-void ceph_osdc_release_request(struct kref *kref)
-{
-	struct ceph_osd_request *req = container_of(kref,
-						    struct ceph_osd_request,
-						    r_kref);
-
-	if (req->r_request)
-		ceph_msg_put(req->r_request);
-	if (req->r_reply)
-		ceph_msg_put(req->r_reply);
-	if (req->r_con_filling_msg) {
-		dout("release_request revoking pages %p from con %p\n",
-		     req->r_pages, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg,
-				      req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
-	}
-	if (req->r_own_pages)
-		ceph_release_page_vector(req->r_pages,
-					 req->r_num_pages);
-#ifdef CONFIG_BLOCK
-	if (req->r_bio)
-		bio_put(req->r_bio);
-#endif
-	ceph_put_snap_context(req->r_snapc);
-	if (req->r_trail) {
-		ceph_pagelist_release(req->r_trail);
-		kfree(req->r_trail);
-	}
-	if (req->r_mempool)
-		mempool_free(req, req->r_osdc->req_mempool);
-	else
-		kfree(req);
-}
-
-static int op_needs_trail(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-	case CEPH_OSD_OP_CALL:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
-static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
-{
-	int i = 0;
-
-	if (needs_trail)
-		*needs_trail = 0;
-	while (ops[i].op) {
-		if (needs_trail && op_needs_trail(ops[i].op))
-			*needs_trail = 1;
-		i++;
-	}
-
-	return i;
-}
-
-struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
-					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
-					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio)
-{
-	struct ceph_osd_request *req;
-	struct ceph_msg *msg;
-	int needs_trail;
-	int num_op = get_num_ops(ops, &needs_trail);
-	size_t msg_size = sizeof(struct ceph_osd_request_head);
-
-	msg_size += num_op*sizeof(struct ceph_osd_op);
-
-	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, gfp_flags);
-		memset(req, 0, sizeof(*req));
-	} else {
-		req = kzalloc(sizeof(*req), gfp_flags);
-	}
-	if (req == NULL)
-		return NULL;
-
-	req->r_osdc = osdc;
-	req->r_mempool = use_mempool;
-
-	kref_init(&req->r_kref);
-	init_completion(&req->r_completion);
-	init_completion(&req->r_safe_completion);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
-	req->r_flags = flags;
-
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-	/* create reply message */
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-	req->r_reply = msg;
-
-	/* allocate space for the trailing data */
-	if (needs_trail) {
-		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
-		if (!req->r_trail) {
-			ceph_osdc_put_request(req);
-			return NULL;
-		}
-		ceph_pagelist_init(req->r_trail);
-	}
-	/* create request message; allow space for oid */
-	msg_size += 40;
-	if (snapc)
-		msg_size += sizeof(u64) * snapc->num_snaps;
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-
-	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
-	memset(msg->front.iov_base, 0, msg->front.iov_len);
-
-	req->r_request = msg;
-	req->r_pages = pages;
-#ifdef CONFIG_BLOCK
-	if (bio) {
-		req->r_bio = bio;
-		bio_get(req->r_bio);
-	}
-#endif
-
-	return req;
-}
-
-static void osd_req_encode_op(struct ceph_osd_request *req,
-			      struct ceph_osd_op *dst,
-			      struct ceph_osd_req_op *src)
-{
-	dst->op = cpu_to_le16(src->op);
-
-	switch (dst->op) {
-	case CEPH_OSD_OP_READ:
-	case CEPH_OSD_OP_WRITE:
-		dst->extent.offset =
-			cpu_to_le64(src->extent.offset);
-		dst->extent.length =
-			cpu_to_le64(src->extent.length);
-		dst->extent.truncate_size =
-			cpu_to_le64(src->extent.truncate_size);
-		dst->extent.truncate_seq =
-			cpu_to_le32(src->extent.truncate_seq);
-		break;
-
-	case CEPH_OSD_OP_GETXATTR:
-	case CEPH_OSD_OP_SETXATTR:
-	case CEPH_OSD_OP_CMPXATTR:
-		BUG_ON(!req->r_trail);
-
-		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
-		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
-		dst->xattr.cmp_op = src->xattr.cmp_op;
-		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		ceph_pagelist_append(req->r_trail, src->xattr.name,
-				     src->xattr.name_len);
-		ceph_pagelist_append(req->r_trail, src->xattr.val,
-				     src->xattr.value_len);
-		break;
-	case CEPH_OSD_OP_CALL:
-		BUG_ON(!req->r_trail);
-
-		dst->cls.class_len = src->cls.class_len;
-		dst->cls.method_len = src->cls.method_len;
-		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
-
-		ceph_pagelist_append(req->r_trail, src->cls.class_name,
-				     src->cls.class_len);
-		ceph_pagelist_append(req->r_trail, src->cls.method_name,
-				     src->cls.method_len);
-		ceph_pagelist_append(req->r_trail, src->cls.indata,
-				     src->cls.indata_len);
-		break;
-	case CEPH_OSD_OP_ROLLBACK:
-		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
-		break;
-	case CEPH_OSD_OP_STARTSYNC:
-		break;
-	default:
-		pr_err("unrecognized osd opcode %d\n", dst->op);
-		WARN_ON(1);
-		break;
-	}
-	dst->payload_len = cpu_to_le32(src->payload_len);
-}
-
-/*
- * build new request AND message
- *
- */
-void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 *plen,
-			     struct ceph_osd_req_op *src_ops,
-			     struct ceph_snap_context *snapc,
-			     struct timespec *mtime,
-			     const char *oid,
-			     int oid_len)
-{
-	struct ceph_msg *msg = req->r_request;
-	struct ceph_osd_request_head *head;
-	struct ceph_osd_req_op *src_op;
-	struct ceph_osd_op *op;
-	void *p;
-	int num_op = get_num_ops(src_ops, NULL);
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int flags = req->r_flags;
-	u64 data_len = 0;
-	int i;
-
-	head = msg->front.iov_base;
-	op = (void *)(head + 1);
-	p = (void *)(op + num_op);
-
-	req->r_snapc = ceph_get_snap_context(snapc);
-
-	head->client_inc = cpu_to_le32(1); /* always, for now. */
-	head->flags = cpu_to_le32(flags);
-	if (flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(&head->mtime, mtime);
-	head->num_ops = cpu_to_le16(num_op);
-
-
-	/* fill in oid */
-	head->object_len = cpu_to_le32(oid_len);
-	memcpy(p, oid, oid_len);
-	p += oid_len;
-
-	src_op = src_ops;
-	while (src_op->op) {
-		osd_req_encode_op(req, op, src_op);
-		src_op++;
-		op++;
-	}
-
-	if (req->r_trail)
-		data_len += req->r_trail->length;
-
-	if (snapc) {
-		head->snap_seq = cpu_to_le64(snapc->seq);
-		head->num_snaps = cpu_to_le32(snapc->num_snaps);
-		for (i = 0; i < snapc->num_snaps; i++) {
-			put_unaligned_le64(snapc->snaps[i], p);
-			p += sizeof(u64);
-		}
-	}
-
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
-	} else if (data_len) {
-		req->r_request->hdr.data_off = 0;
-		req->r_request->hdr.data_len = cpu_to_le32(data_len);
-	}
-
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
-	return;
-}
-
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-					       struct ceph_file_layout *layout,
-					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
-					       int opcode, int flags,
-					       struct ceph_snap_context *snapc,
-					       int do_sync,
-					       u32 truncate_seq,
-					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
-{
-	struct ceph_osd_req_op ops[3];
-	struct ceph_osd_request *req;
-
-	ops[0].op = opcode;
-	ops[0].extent.truncate_seq = truncate_seq;
-	ops[0].extent.truncate_size = truncate_size;
-	ops[0].payload_len = 0;
-
-	if (do_sync) {
-		ops[1].op = CEPH_OSD_OP_STARTSYNC;
-		ops[1].payload_len = 0;
-		ops[2].op = 0;
-	} else
-		ops[1].op = 0;
-
-	req = ceph_osdc_alloc_request(osdc, flags,
-					 snapc, ops,
-					 use_mempool,
-					 GFP_NOFS, NULL, NULL);
-	if (IS_ERR(req))
-		return req;
-
-	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req, ops);
-	req->r_file_layout = *layout;  /* keep a copy */
-
-	ceph_osdc_build_request(req, off, plen, ops,
-				snapc,
-				mtime,
-				req->r_oid, req->r_oid_len);
-
-	return req;
-}
-
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- */
-static void __insert_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *new)
-{
-	struct rb_node **p = &osdc->requests.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_osd_request, r_node);
-		if (new->r_tid < req->r_tid)
-			p = &(*p)->rb_left;
-		else if (new->r_tid > req->r_tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->r_node, parent, p);
-	rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-						 u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid)
-			n = n->rb_left;
-		else if (tid > req->r_tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			if (!n->rb_left)
-				return req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return NULL;
-}
-
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-
-	if (!osd)
-		return;
-	dout("osd_reset osd%d\n", osd->o_osd);
-	osdc = osd->o_osdc;
-	down_read(&osdc->map_sem);
-	kick_requests(osdc, osd);
-	up_read(&osdc->map_sem);
-}
-
-/*
- * Track open sessions with osds.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd *osd;
-
-	osd = kzalloc(sizeof(*osd), GFP_NOFS);
-	if (!osd)
-		return NULL;
-
-	atomic_set(&osd->o_ref, 1);
-	osd->o_osdc = osdc;
-	INIT_LIST_HEAD(&osd->o_requests);
-	INIT_LIST_HEAD(&osd->o_osd_lru);
-	osd->o_incarnation = 1;
-
-	ceph_con_init(osdc->client->msgr, &osd->o_con);
-	osd->o_con.private = osd;
-	osd->o_con.ops = &osd_con_ops;
-	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-
-	INIT_LIST_HEAD(&osd->o_keepalive_item);
-	return osd;
-}
-
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
-	if (atomic_inc_not_zero(&osd->o_ref)) {
-		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
-		     atomic_read(&osd->o_ref));
-		return osd;
-	} else {
-		dout("get_osd %p FAIL\n", osd);
-		return NULL;
-	}
-}
-
-static void put_osd(struct ceph_osd *osd)
-{
-	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
-	     atomic_read(&osd->o_ref) - 1);
-	if (atomic_dec_and_test(&osd->o_ref)) {
-		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
-		if (osd->o_authorizer)
-			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
-		kfree(osd);
-	}
-}
-
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("__remove_osd %p\n", osd);
-	BUG_ON(!list_empty(&osd->o_requests));
-	rb_erase(&osd->o_node, &osdc->osds);
-	list_del_init(&osd->o_osd_lru);
-	ceph_con_close(&osd->o_con);
-	put_osd(osd);
-}
-
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-			      struct ceph_osd *osd)
-{
-	dout("__move_osd_to_lru %p\n", osd);
-	BUG_ON(!list_empty(&osd->o_osd_lru));
-	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
-}
-
-static void __remove_osd_from_lru(struct ceph_osd *osd)
-{
-	dout("__remove_osd_from_lru %p\n", osd);
-	if (!list_empty(&osd->o_osd_lru))
-		list_del_init(&osd->o_osd_lru);
-}
-
-static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
-{
-	struct ceph_osd *osd, *nosd;
-
-	dout("__remove_old_osds %p\n", osdc);
-	mutex_lock(&osdc->request_mutex);
-	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-		if (!remove_all && time_before(jiffies, osd->lru_ttl))
-			break;
-		__remove_osd(osdc, osd);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * reset osd connect
- */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	struct ceph_osd_request *req;
-	int ret = 0;
-
-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-	if (list_empty(&osd->o_requests)) {
-		__remove_osd(osdc, osd);
-	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-			  &osd->o_con.peer_addr,
-			  sizeof(osd->o_con.peer_addr)) == 0 &&
-		   !ceph_con_opened(&osd->o_con)) {
-		dout(" osd addr hasn't changed and connection never opened,"
-		     " letting msgr retry");
-		/* touch each r_stamp for handle_timeout()'s benfit */
-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
-			req->r_stamp = jiffies;
-		ret = -EAGAIN;
-	} else {
-		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
-		osd->o_incarnation++;
-	}
-	return ret;
-}
-
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-	struct rb_node **p = &osdc->osds.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd *osd = NULL;
-
-	while (*p) {
-		parent = *p;
-		osd = rb_entry(parent, struct ceph_osd, o_node);
-		if (new->o_osd < osd->o_osd)
-			p = &(*p)->rb_left;
-		else if (new->o_osd > osd->o_osd)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->o_node, parent, p);
-	rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-	struct ceph_osd *osd;
-	struct rb_node *n = osdc->osds.rb_node;
-
-	while (n) {
-		osd = rb_entry(n, struct ceph_osd, o_node);
-		if (o < osd->o_osd)
-			n = n->rb_left;
-		else if (o > osd->o_osd)
-			n = n->rb_right;
-		else
-			return osd;
-	}
-	return NULL;
-}
-
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
-{
-	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->mount_args->osd_keepalive_timeout * HZ);
-}
-
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
-{
-	cancel_delayed_work(&osdc->timeout_work);
-}
-
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void register_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *req)
-{
-	mutex_lock(&osdc->request_mutex);
-	req->r_tid = ++osdc->last_tid;
-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-
-	dout("register_request %p tid %lld\n", req, req->r_tid);
-	__insert_request(osdc, req);
-	ceph_osdc_get_request(req);
-	osdc->num_requests++;
-
-	if (osdc->num_requests == 1) {
-		dout(" first request, scheduling timeout\n");
-		__schedule_osd_timeout(osdc);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
-{
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	rb_erase(&req->r_node, &osdc->requests);
-	osdc->num_requests--;
-
-	if (req->r_osd) {
-		/* make sure the original request isn't in flight. */
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-
-		list_del_init(&req->r_osd_item);
-		if (list_empty(&req->r_osd->o_requests))
-			__move_osd_to_lru(osdc, req->r_osd);
-		req->r_osd = NULL;
-	}
-
-	ceph_osdc_put_request(req);
-
-	list_del_init(&req->r_req_lru_item);
-	if (osdc->num_requests == 0) {
-		dout(" no requests, canceling timeout\n");
-		__cancel_osd_timeout(osdc);
-	}
-}
-
-/*
- * Cancel a previously queued request message
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
-	if (req->r_sent && req->r_osd) {
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-		req->r_sent = 0;
-	}
-	list_del_init(&req->r_req_lru_item);
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
-		      struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg pgid;
-	int acting[CEPH_PG_MAX_SIZE];
-	int o = -1, num = 0;
-	int err;
-
-	dout("map_osds %p tid %lld\n", req, req->r_tid);
-	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
-				      &req->r_file_layout, osdc->osdmap);
-	if (err)
-		return err;
-	pgid = reqhead->layout.ol_pgid;
-	req->r_pgid = pgid;
-
-	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
-	if (err > 0) {
-		o = acting[0];
-		num = err;
-	}
-
-	if ((req->r_osd && req->r_osd->o_osd == o &&
-	     req->r_sent >= req->r_osd->o_incarnation &&
-	     req->r_num_pg_osds == num &&
-	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-	    (req->r_osd == NULL && o == -1))
-		return 0;  /* no change */
-
-	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
-	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
-	     req->r_osd ? req->r_osd->o_osd : -1);
-
-	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-	req->r_num_pg_osds = num;
-
-	if (req->r_osd) {
-		__cancel_request(req);
-		list_del_init(&req->r_osd_item);
-		req->r_osd = NULL;
-	}
-
-	req->r_osd = __lookup_osd(osdc, o);
-	if (!req->r_osd && o >= 0) {
-		err = -ENOMEM;
-		req->r_osd = create_osd(osdc);
-		if (!req->r_osd)
-			goto out;
-
-		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
-		req->r_osd->o_osd = o;
-		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
-		__insert_osd(osdc, req->r_osd);
-
-		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
-	}
-
-	if (req->r_osd) {
-		__remove_osd_from_lru(req->r_osd);
-		list_add(&req->r_osd_item, &req->r_osd->o_requests);
-	}
-	err = 1;   /* osd or pg changed */
-
-out:
-	return err;
-}
-
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
-			  struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *reqhead;
-	int err;
-
-	err = __map_osds(osdc, req);
-	if (err < 0)
-		return err;
-	if (req->r_osd == NULL) {
-		dout("send_request %p no up osds in pg\n", req);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-		return 0;
-	}
-
-	dout("send_request %p tid %llu to osd%d flags %d\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
-	reqhead = req->r_request->front.iov_base;
-	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-	reqhead->reassert_version = req->r_reassert_version;
-
-	req->r_stamp = jiffies;
-	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
-
-	ceph_msg_get(req->r_request); /* send consumes a ref */
-	ceph_con_send(&req->r_osd->o_con, req->r_request);
-	req->r_sent = req->r_osd->o_incarnation;
-	return 0;
-}
-
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- */
-static void handle_timeout(struct work_struct *work)
-{
-	struct ceph_osd_client *osdc =
-		container_of(work, struct ceph_osd_client, timeout_work.work);
-	struct ceph_osd_request *req, *last_req = NULL;
-	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
-	unsigned long keepalive =
-		osdc->client->mount_args->osd_keepalive_timeout * HZ;
-	unsigned long last_stamp = 0;
-	struct rb_node *p;
-	struct list_head slow_osds;
-
-	dout("timeout\n");
-	down_read(&osdc->map_sem);
-
-	ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		if (req->r_resend) {
-			int err;
-
-			dout("osdc resending prev failed %lld\n", req->r_tid);
-			err = __send_request(osdc, req);
-			if (err)
-				dout("osdc failed again on %lld\n", req->r_tid);
-			else
-				req->r_resend = false;
-			continue;
-		}
-	}
-
-	/*
-	 * reset osds that appear to be _really_ unresponsive.  this
-	 * is a failsafe measure.. we really shouldn't be getting to
-	 * this point if the system is working properly.  the monitors
-	 * should mark the osd as failed and we should find out about
-	 * it from an updated osd map.
-	 */
-	while (timeout && !list_empty(&osdc->req_lru)) {
-		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-				 r_req_lru_item);
-
-		if (time_before(jiffies, req->r_stamp + timeout))
-			break;
-
-		BUG_ON(req == last_req && req->r_stamp == last_stamp);
-		last_req = req;
-		last_stamp = req->r_stamp;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-			   req->r_tid, osd->o_osd);
-		__kick_requests(osdc, osd);
-	}
-
-	/*
-	 * ping osds that are a bit slow.  this ensures that if there
-	 * is a break in the TCP connection we will notice, and reopen
-	 * a connection with that osd (from the fault callback).
-	 */
-	INIT_LIST_HEAD(&slow_osds);
-	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_stamp + keepalive))
-			break;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		dout(" tid %llu is slow, will send keepalive on osd%d\n",
-		     req->r_tid, osd->o_osd);
-		list_move_tail(&osd->o_keepalive_item, &slow_osds);
-	}
-	while (!list_empty(&slow_osds)) {
-		osd = list_entry(slow_osds.next, struct ceph_osd,
-				 o_keepalive_item);
-		list_del_init(&osd->o_keepalive_item);
-		ceph_con_keepalive(&osd->o_con);
-	}
-
-	__schedule_osd_timeout(osdc);
-	mutex_unlock(&osdc->request_mutex);
-
-	up_read(&osdc->map_sem);
-}
-
-static void handle_osds_timeout(struct work_struct *work)
-{
-	struct ceph_osd_client *osdc =
-		container_of(work, struct ceph_osd_client,
-			     osds_timeout_work.work);
-	unsigned long delay =
-		osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
-
-	dout("osds timeout\n");
-	down_read(&osdc->map_sem);
-	remove_old_osds(osdc, 0);
-	up_read(&osdc->map_sem);
-
-	schedule_delayed_work(&osdc->osds_timeout_work,
-			      round_jiffies_relative(delay));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
-			 struct ceph_connection *con)
-{
-	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
-	struct ceph_osd_request *req;
-	u64 tid;
-	int numops, object_len, flags;
-	s32 result;
-
-	tid = le64_to_cpu(msg->hdr.tid);
-	if (msg->front.iov_len < sizeof(*rhead))
-		goto bad;
-	numops = le32_to_cpu(rhead->num_ops);
-	object_len = le32_to_cpu(rhead->object_len);
-	result = le32_to_cpu(rhead->result);
-	if (msg->front.iov_len != sizeof(*rhead) + object_len +
-	    numops * sizeof(struct ceph_osd_op))
-		goto bad;
-	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
-
-	/* lookup */
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		return;
-	}
-	ceph_osdc_get_request(req);
-	flags = le32_to_cpu(rhead->flags);
-
-	/*
-	 * if this connection filled our message, drop our reference now, to
-	 * avoid a (safe but slower) revoke later.
-	 */
-	if (req->r_con_filling_msg == con && req->r_reply == msg) {
-		dout(" dropping con_filling_msg ref %p\n", con);
-		req->r_con_filling_msg = NULL;
-		ceph_con_put(con);
-	}
-
-	if (!req->r_got_reply) {
-		unsigned bytes;
-
-		req->r_result = le32_to_cpu(rhead->result);
-		bytes = le32_to_cpu(msg->hdr.data_len);
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		if (req->r_result == 0)
-			req->r_result = bytes;
-
-		/* in case this is a write and we need to replay, */
-		req->r_reassert_version = rhead->reassert_version;
-
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		goto done;
-	}
-
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-	/* either this is a read, or we got the safe response */
-	if (result < 0 ||
-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-		__unregister_request(osdc, req);
-
-	mutex_unlock(&osdc->request_mutex);
-
-	if (req->r_callback)
-		req->r_callback(req, msg);
-	else
-		complete_all(&req->r_completion);
-
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_safe_callback)
-			req->r_safe_callback(req, msg);
-		complete_all(&req->r_safe_completion);  /* fsync waiter */
-	}
-
-done:
-	ceph_osdc_put_request(req);
-	return;
-
-bad:
-	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-	       (int)sizeof(*rhead));
-	ceph_msg_dump(msg);
-}
-
-
-static int __kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *p, *n;
-	int needmap = 0;
-	int err;
-
-	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-	if (kickosd) {
-		err = __reset_osd(osdc, kickosd);
-		if (err == -EAGAIN)
-			return 1;
-	} else {
-		for (p = rb_first(&osdc->osds); p; p = n) {
-			struct ceph_osd *osd =
-				rb_entry(p, struct ceph_osd, o_node);
-
-			n = rb_next(p);
-			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-			    memcmp(&osd->o_con.peer_addr,
-				   ceph_osd_addr(osdc->osdmap,
-						 osd->o_osd),
-				   sizeof(struct ceph_entity_addr)) != 0)
-				__reset_osd(osdc, osd);
-		}
-	}
-
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		if (req->r_resend) {
-			dout(" r_resend set on tid %llu\n", req->r_tid);
-			__cancel_request(req);
-			goto kick;
-		}
-		if (req->r_osd && kickosd == req->r_osd) {
-			__cancel_request(req);
-			goto kick;
-		}
-
-		err = __map_osds(osdc, req);
-		if (err == 0)
-			continue;  /* no change */
-		if (err < 0) {
-			/*
-			 * FIXME: really, we should set the request
-			 * error and fail if this isn't a 'nofail'
-			 * request, but that's a fair bit more
-			 * complicated to do.  So retry!
-			 */
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-			continue;
-		}
-		if (req->r_osd == NULL) {
-			dout("tid %llu maps to no valid osd\n", req->r_tid);
-			needmap++;  /* request a newer map */
-			continue;
-		}
-
-kick:
-		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-		     req->r_osd ? req->r_osd->o_osd : -1);
-		req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		err = __send_request(osdc, req);
-		if (err) {
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-		}
-	}
-
-	return needmap;
-}
-
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd)
-{
-	int needmap;
-
-	mutex_lock(&osdc->request_mutex);
-	needmap = __kick_requests(osdc, kickosd);
-	mutex_unlock(&osdc->request_mutex);
-
-	if (needmap) {
-		dout("%d requests for down osds, need new map\n", needmap);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-	}
-
-}
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster.  Kick requests
- * off to different OSDs as needed.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-	void *p, *end, *next;
-	u32 nr_maps, maplen;
-	u32 epoch;
-	struct ceph_osdmap *newmap = NULL, *oldmap;
-	int err;
-	struct ceph_fsid fsid;
-
-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	/* verify fsid */
-	ceph_decode_need(&p, end, sizeof(fsid), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(osdc->client, &fsid) < 0)
-		return;
-
-	down_write(&osdc->map_sem);
-
-	/* incremental maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d inc maps\n", nr_maps);
-	while (nr_maps > 0) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		epoch = ceph_decode_32(&p);
-		maplen = ceph_decode_32(&p);
-		ceph_decode_need(&p, end, maplen, bad);
-		next = p + maplen;
-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
-			dout("applying incremental map %u len %d\n",
-			     epoch, maplen);
-			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap,
-							  osdc->client->msgr);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
-				goto bad;
-			}
-			BUG_ON(!newmap);
-			if (newmap != osdc->osdmap) {
-				ceph_osdmap_destroy(osdc->osdmap);
-				osdc->osdmap = newmap;
-			}
-		} else {
-			dout("ignoring incremental map %u len %d\n",
-			     epoch, maplen);
-		}
-		p = next;
-		nr_maps--;
-	}
-	if (newmap)
-		goto done;
-
-	/* full maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d full maps\n", nr_maps);
-	while (nr_maps) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		epoch = ceph_decode_32(&p);
-		maplen = ceph_decode_32(&p);
-		ceph_decode_need(&p, end, maplen, bad);
-		if (nr_maps > 1) {
-			dout("skipping non-latest full map %u len %d\n",
-			     epoch, maplen);
-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
-			dout("skipping full map %u len %d, "
-			     "older than our %u\n", epoch, maplen,
-			     osdc->osdmap->epoch);
-		} else {
-			dout("taking full map %u len %d\n", epoch, maplen);
-			newmap = osdmap_decode(&p, p+maplen);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
-				goto bad;
-			}
-			BUG_ON(!newmap);
-			oldmap = osdc->osdmap;
-			osdc->osdmap = newmap;
-			if (oldmap)
-				ceph_osdmap_destroy(oldmap);
-		}
-		p += maplen;
-		nr_maps--;
-	}
-
-done:
-	downgrade_write(&osdc->map_sem);
-	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
-	if (newmap)
-		kick_requests(osdc, NULL);
-	up_read(&osdc->map_sem);
-	wake_up_all(&osdc->client->auth_wq);
-	return;
-
-bad:
-	pr_err("osdc handle_map corrupt msg\n");
-	ceph_msg_dump(msg);
-	up_write(&osdc->map_sem);
-	return;
-}
-
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
-{
-	int rc = 0;
-
-	req->r_request->pages = req->r_pages;
-	req->r_request->nr_pages = req->r_num_pages;
-#ifdef CONFIG_BLOCK
-	req->r_request->bio = req->r_bio;
-#endif
-	req->r_request->trail = req->r_trail;
-
-	register_request(osdc, req);
-
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	/*
-	 * a racing kick_requests() may have sent the message for us
-	 * while we dropped request_mutex above, so only send now if
-	 * the request still han't been touched yet.
-	 */
-	if (req->r_sent == 0) {
-		rc = __send_request(osdc, req);
-		if (rc) {
-			if (nofail) {
-				dout("osdc_start_request failed send, "
-				     " marking %lld\n", req->r_tid);
-				req->r_resend = true;
-				rc = 0;
-			} else {
-				__unregister_request(osdc, req);
-			}
-		}
-	}
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-	return rc;
-}
-
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
-{
-	int rc;
-
-	rc = wait_for_completion_interruptible(&req->r_completion);
-	if (rc < 0) {
-		mutex_lock(&osdc->request_mutex);
-		__cancel_request(req);
-		__unregister_request(osdc, req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
-		return rc;
-	}
-
-	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
-	return req->r_result;
-}
-
-/*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd_request *req;
-	u64 last_tid, next_tid = 0;
-
-	mutex_lock(&osdc->request_mutex);
-	last_tid = osdc->last_tid;
-	while (1) {
-		req = __lookup_request_ge(osdc, next_tid);
-		if (!req)
-			break;
-		if (req->r_tid > last_tid)
-			break;
-
-		next_tid = req->r_tid + 1;
-		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-			continue;
-
-		ceph_osdc_get_request(req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("sync waiting on tid %llu (last is %llu)\n",
-		     req->r_tid, last_tid);
-		wait_for_completion(&req->r_safe_completion);
-		mutex_lock(&osdc->request_mutex);
-		ceph_osdc_put_request(req);
-	}
-	mutex_unlock(&osdc->request_mutex);
-	dout("sync done (thru tid %llu)\n", last_tid);
-}
-
-/*
- * init, shutdown
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
-	int err;
-
-	dout("init\n");
-	osdc->client = client;
-	osdc->osdmap = NULL;
-	init_rwsem(&osdc->map_sem);
-	init_completion(&osdc->map_waiters);
-	osdc->last_requested_map = 0;
-	mutex_init(&osdc->request_mutex);
-	osdc->last_tid = 0;
-	osdc->osds = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->osd_lru);
-	osdc->requests = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->req_lru);
-	osdc->num_requests = 0;
-	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
-	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-
-	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
-
-	err = -ENOMEM;
-	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-					sizeof(struct ceph_osd_request));
-	if (!osdc->req_mempool)
-		goto out;
-
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
-				"osd_op");
-	if (err < 0)
-		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-				OSD_OPREPLY_FRONT_LEN, 10, true,
-				"osd_op_reply");
-	if (err < 0)
-		goto out_msgpool;
-	return 0;
-
-out_msgpool:
-	ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
-	mempool_destroy(osdc->req_mempool);
-out:
-	return err;
-}
-
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
-	cancel_delayed_work_sync(&osdc->timeout_work);
-	cancel_delayed_work_sync(&osdc->osds_timeout_work);
-	if (osdc->osdmap) {
-		ceph_osdmap_destroy(osdc->osdmap);
-		osdc->osdmap = NULL;
-	}
-	remove_old_osds(osdc, 1);
-	mempool_destroy(osdc->req_mempool);
-	ceph_msgpool_destroy(&osdc->msgpool_op);
-	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			u32 truncate_seq, u64 truncate_size,
-			struct page **pages, int num_pages)
-{
-	struct ceph_osd_request *req;
-	int rc = 0;
-
-	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-	     vino.snap, off, *plen);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
-				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, 0, truncate_seq, truncate_size, NULL,
-				    false, 1);
-	if (!req)
-		return -ENOMEM;
-
-	/* it may be a short read due to an object boundary */
-	req->r_pages = pages;
-
-	dout("readpages  final extent is %llu~%llu (%d pages)\n",
-	     off, *plen, req->r_num_pages);
-
-	rc = ceph_osdc_start_request(osdc, req, false);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_osdc_put_request(req);
-	dout("readpages result %d\n", rc);
-	return rc;
-}
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
-			 struct ceph_file_layout *layout,
-			 struct ceph_snap_context *snapc,
-			 u64 off, u64 len,
-			 u32 truncate_seq, u64 truncate_size,
-			 struct timespec *mtime,
-			 struct page **pages, int num_pages,
-			 int flags, int do_sync, bool nofail)
-{
-	struct ceph_osd_request *req;
-	int rc = 0;
-
-	BUG_ON(vino.snap != CEPH_NOSNAP);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
-				    CEPH_OSD_OP_WRITE,
-				    flags | CEPH_OSD_FLAG_ONDISK |
-					    CEPH_OSD_FLAG_WRITE,
-				    snapc, do_sync,
-				    truncate_seq, truncate_size, mtime,
-				    nofail, 1);
-	if (!req)
-		return -ENOMEM;
-
-	/* it may be a short write due to an object boundary */
-	req->r_pages = pages;
-	dout("writepages %llu~%llu (%d pages)\n", off, len,
-	     req->r_num_pages);
-
-	rc = ceph_osdc_start_request(osdc, req, nofail);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_osdc_put_request(req);
-	if (rc == 0)
-		rc = len;
-	dout("writepages result %d\n", rc);
-	return rc;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	if (!osd)
-		goto out;
-	osdc = osd->o_osdc;
-
-	switch (type) {
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(osdc, msg);
-		break;
-	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg, con);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-out:
-	ceph_msg_put(msg);
-}
-
-/*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
- */
-static struct ceph_msg *get_reply(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr,
-				  int *skip)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc = osd->o_osdc;
-	struct ceph_msg *m;
-	struct ceph_osd_request *req;
-	int front = le32_to_cpu(hdr->front_len);
-	int data_len = le32_to_cpu(hdr->data_len);
-	u64 tid;
-
-	tid = le64_to_cpu(hdr->tid);
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (!req) {
-		*skip = 1;
-		m = NULL;
-		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-			osd->o_osd);
-		goto out;
-	}
-
-	if (req->r_con_filling_msg) {
-		dout("get_reply revoking msg %p from old con %p\n",
-		     req->r_reply, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
-		req->r_con_filling_msg = NULL;
-	}
-
-	if (front > req->r_reply->front.iov_len) {
-		pr_warning("get_reply front %d > preallocated %d\n",
-			   front, (int)req->r_reply->front.iov_len);
-		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
-		if (!m)
-			goto out;
-		ceph_msg_put(req->r_reply);
-		req->r_reply = m;
-	}
-	m = ceph_msg_get(req->r_reply);
-
-	if (data_len > 0) {
-		unsigned data_off = le16_to_cpu(hdr->data_off);
-		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-		if (unlikely(req->r_num_pages < want)) {
-			pr_warning("tid %lld reply %d > expected %d pages\n",
-				   tid, want, m->nr_pages);
-			*skip = 1;
-			ceph_msg_put(m);
-			m = NULL;
-			goto out;
-		}
-		m->pages = req->r_pages;
-		m->nr_pages = req->r_num_pages;
-#ifdef CONFIG_BLOCK
-		m->bio = req->r_bio;
-#endif
-	}
-	*skip = 0;
-	req->r_con_filling_msg = ceph_con_get(con);
-	dout("get_reply tid %lld %p\n", tid, m);
-
-out:
-	mutex_unlock(&osdc->request_mutex);
-	return m;
-
-}
-
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr,
-				  int *skip)
-{
-	struct ceph_osd *osd = con->private;
-	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
-
-	switch (type) {
-	case CEPH_MSG_OSD_MAP:
-		return ceph_msg_new(type, front, GFP_NOFS);
-	case CEPH_MSG_OSD_OPREPLY:
-		return get_reply(con, hdr, skip);
-	default:
-		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-			osd->o_osd);
-		*skip = 1;
-		return NULL;
-	}
-}
-
-/*
- * Wrappers to refcount containing ceph_osd struct
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	if (get_osd(osd))
-		return con;
-	return NULL;
-}
-
-static void put_osd_con(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	put_osd(osd);
-}
-
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
-			  void **buf, int *len, int *proto,
-			  void **reply_buf, int *reply_len, int force_new)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-	int ret = 0;
-
-	if (force_new && o->o_authorizer) {
-		ac->ops->destroy_authorizer(ac, o->o_authorizer);
-		o->o_authorizer = NULL;
-	}
-	if (o->o_authorizer == NULL) {
-		ret = ac->ops->create_authorizer(
-			ac, CEPH_ENTITY_TYPE_OSD,
-			&o->o_authorizer,
-			&o->o_authorizer_buf,
-			&o->o_authorizer_buf_len,
-			&o->o_authorizer_reply_buf,
-			&o->o_authorizer_reply_buf_len);
-		if (ret)
-			return ret;
-	}
-
-	*proto = ac->protocol;
-	*buf = o->o_authorizer_buf;
-	*len = o->o_authorizer_buf_len;
-	*reply_buf = o->o_authorizer_reply_buf;
-	*reply_len = o->o_authorizer_reply_buf_len;
-	return 0;
-}
-
-
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
-}
-
-static int invalidate_authorizer(struct ceph_connection *con)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-	if (ac->ops->invalidate_authorizer)
-		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-
-	return ceph_monc_validate_auth(&osdc->client->monc);
-}
-
-static const struct ceph_connection_operations osd_con_ops = {
-	.get = get_osd_con,
-	.put = put_osd_con,
-	.dispatch = dispatch,
-	.get_authorizer = get_authorizer,
-	.verify_authorizer_reply = verify_authorizer_reply,
-	.invalidate_authorizer = invalidate_authorizer,
-	.alloc_msg = alloc_msg,
-	.fault = osd_reset,
-};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index 65c9c560f1ac..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,233 +0,0 @@
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-struct ceph_authorizer;
-struct ceph_pagelist;
-
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-				     struct ceph_msg *);
-
-/* a given osd we're communicating with */
-struct ceph_osd {
-	atomic_t o_ref;
-	struct ceph_osd_client *o_osdc;
-	int o_osd;
-	int o_incarnation;
-	struct rb_node o_node;
-	struct ceph_connection o_con;
-	struct list_head o_requests;
-	struct list_head o_osd_lru;
-	struct ceph_authorizer *o_authorizer;
-	void *o_authorizer_buf, *o_authorizer_reply_buf;
-	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
-	unsigned long lru_ttl;
-	int o_marked_for_keepalive;
-	struct list_head o_keepalive_item;
-};
-
-/* an in-flight request */
-struct ceph_osd_request {
-	u64             r_tid;              /* unique for this client */
-	struct rb_node  r_node;
-	struct list_head r_req_lru_item;
-	struct list_head r_osd_item;
-	struct ceph_osd *r_osd;
-	struct ceph_pg   r_pgid;
-	int              r_pg_osds[CEPH_PG_MAX_SIZE];
-	int              r_num_pg_osds;
-
-	struct ceph_connection *r_con_filling_msg;
-
-	struct ceph_msg  *r_request, *r_reply;
-	int               r_result;
-	int               r_flags;     /* any additional flags for the osd */
-	u32               r_sent;      /* >0 if r_request is sending/sent */
-	int               r_got_reply;
-
-	struct ceph_osd_client *r_osdc;
-	struct kref       r_kref;
-	bool              r_mempool;
-	struct completion r_completion, r_safe_completion;
-	ceph_osdc_callback_t r_callback, r_safe_callback;
-	struct ceph_eversion r_reassert_version;
-	struct list_head  r_unsafe_item;
-
-	struct inode *r_inode;         	      /* for use by callbacks */
-
-	char              r_oid[40];          /* object name */
-	int               r_oid_len;
-	unsigned long     r_stamp;            /* send OR check time */
-	bool              r_resend;           /* msg send failed, needs retry */
-
-	struct ceph_file_layout r_file_layout;
-	struct ceph_snap_context *r_snapc;    /* snap context for writes */
-	unsigned          r_num_pages;        /* size of page array (follows) */
-	struct page     **r_pages;            /* pages for data payload */
-	int               r_pages_from_pool;
-	int               r_own_pages;        /* if true, i own page list */
-#ifdef CONFIG_BLOCK
-	struct bio       *r_bio;	      /* instead of pages */
-#endif
-
-	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
-};
-
-struct ceph_osd_client {
-	struct ceph_client     *client;
-
-	struct ceph_osdmap     *osdmap;       /* current map */
-	struct rw_semaphore    map_sem;
-	struct completion      map_waiters;
-	u64                    last_requested_map;
-
-	struct mutex           request_mutex;
-	struct rb_root         osds;          /* osds */
-	struct list_head       osd_lru;       /* idle osds */
-	u64                    timeout_tid;   /* tid of timeout triggering rq */
-	u64                    last_tid;      /* tid of last request */
-	struct rb_root         requests;      /* pending requests */
-	struct list_head       req_lru;	      /* pending requests lru */
-	int                    num_requests;
-	struct delayed_work    timeout_work;
-	struct delayed_work    osds_timeout_work;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry 	       *debugfs_file;
-#endif
-
-	mempool_t              *req_mempool;
-
-	struct ceph_msgpool	msgpool_op;
-	struct ceph_msgpool	msgpool_op_reply;
-};
-
-struct ceph_osd_req_op {
-	u16 op;           /* CEPH_OSD_OP_* */
-	u32 flags;        /* CEPH_OSD_FLAG_* */
-	union {
-		struct {
-			u64 offset, length;
-			u64 truncate_size;
-			u32 truncate_seq;
-		} extent;
-		struct {
-			const char *name;
-			u32 name_len;
-			const char  *val;
-			u32 value_len;
-			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
-			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
-		} xattr;
-		struct {
-			const char *class_name;
-			__u8 class_len;
-			const char *method_name;
-			__u8 method_len;
-			__u8 argc;
-			const char *indata;
-			u32 indata_len;
-		} cls;
-		struct {
-			u64 cookie, count;
-		} pgls;
-	        struct {
-		        u64 snapid;
-	        } snap;
-	};
-	u32 payload_len;
-};
-
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-			  struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
-				   struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
-				 struct ceph_msg *msg);
-
-extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-			struct ceph_file_layout *layout,
-			u64 snapid,
-			u64 off, u64 *plen, u64 *bno,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op);
-
-extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
-					       int flags,
-					       struct ceph_snap_context *snapc,
-					       struct ceph_osd_req_op *ops,
-					       bool use_mempool,
-					       gfp_t gfp_flags,
-					       struct page **pages,
-					       struct bio *bio);
-
-extern void ceph_osdc_build_request(struct ceph_osd_request *req,
-				    u64 off, u64 *plen,
-				    struct ceph_osd_req_op *src_ops,
-				    struct ceph_snap_context *snapc,
-				    struct timespec *mtime,
-				    const char *oid,
-				    int oid_len);
-
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
-				      struct ceph_file_layout *layout,
-				      struct ceph_vino vino,
-				      u64 offset, u64 *len, int op, int flags,
-				      struct ceph_snap_context *snapc,
-				      int do_sync, u32 truncate_seq,
-				      u64 truncate_size,
-				      struct timespec *mtime,
-				      bool use_mempool, int num_reply);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-	kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-	kref_put(&req->r_kref, ceph_osdc_release_request);
-}
-
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req,
-				   bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-				  struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-			       struct ceph_vino vino,
-			       struct ceph_file_layout *layout,
-			       u64 off, u64 *plen,
-			       u32 truncate_seq, u64 truncate_size,
-			       struct page **pages, int nr_pages);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
-				struct ceph_vino vino,
-				struct ceph_file_layout *layout,
-				struct ceph_snap_context *sc,
-				u64 off, u64 len,
-				u32 truncate_seq, u64 truncate_size,
-				struct timespec *mtime,
-				struct page **pages, int nr_pages,
-				int flags, int do_sync, bool nofail);
-
-#endif
-
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index 3ccd11761cb6..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-#include <asm/div64.h>
-
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
-	int flag = 0;
-
-	if (!len)
-		goto done;
-
-	*str = '\0';
-	if (state) {
-		if (state & CEPH_OSD_EXISTS) {
-			snprintf(str, len, "exists");
-			flag = 1;
-		}
-		if (state & CEPH_OSD_UP) {
-			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-				 "up");
-			flag = 1;
-		}
-	} else {
-		snprintf(str, len, "doesn't exist");
-	}
-done:
-	return str;
-}
-
-/* maps */
-
-static int calc_bits_of(unsigned t)
-{
-	int b = 0;
-	while (t) {
-		t = t >> 1;
-		b++;
-	}
-	return b;
-}
-
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
-	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-	pi->pgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-	pi->lpg_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-	pi->lpgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-
-/*
- * decode crush map
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
-				       struct crush_bucket_uniform *b)
-{
-	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
-	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
-	b->item_weight = ceph_decode_32(p);
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_list_bucket(void **p, void *end,
-				    struct crush_bucket_list *b)
-{
-	int j;
-	dout("crush_decode_list_bucket %p to %p\n", *p, end);
-	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->item_weights == NULL)
-		return -ENOMEM;
-	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->sum_weights == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-	for (j = 0; j < b->h.size; j++) {
-		b->item_weights[j] = ceph_decode_32(p);
-		b->sum_weights[j] = ceph_decode_32(p);
-	}
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_tree_bucket(void **p, void *end,
-				    struct crush_bucket_tree *b)
-{
-	int j;
-	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
-	ceph_decode_32_safe(p, end, b->num_nodes, bad);
-	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
-	if (b->node_weights == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
-	for (j = 0; j < b->num_nodes; j++)
-		b->node_weights[j] = ceph_decode_32(p);
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_straw_bucket(void **p, void *end,
-				     struct crush_bucket_straw *b)
-{
-	int j;
-	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
-	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->item_weights == NULL)
-		return -ENOMEM;
-	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->straws == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-	for (j = 0; j < b->h.size; j++) {
-		b->item_weights[j] = ceph_decode_32(p);
-		b->straws[j] = ceph_decode_32(p);
-	}
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
-	struct crush_map *c;
-	int err = -EINVAL;
-	int i, j;
-	void **p = &pbyval;
-	void *start = pbyval;
-	u32 magic;
-
-	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-	c = kzalloc(sizeof(*c), GFP_NOFS);
-	if (c == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	ceph_decode_need(p, end, 4*sizeof(u32), bad);
-	magic = ceph_decode_32(p);
-	if (magic != CRUSH_MAGIC) {
-		pr_err("crush_decode magic %x != current %x\n",
-		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
-		goto bad;
-	}
-	c->max_buckets = ceph_decode_32(p);
-	c->max_rules = ceph_decode_32(p);
-	c->max_devices = ceph_decode_32(p);
-
-	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
-	if (c->device_parents == NULL)
-		goto badmem;
-	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
-	if (c->bucket_parents == NULL)
-		goto badmem;
-
-	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
-	if (c->buckets == NULL)
-		goto badmem;
-	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
-	if (c->rules == NULL)
-		goto badmem;
-
-	/* buckets */
-	for (i = 0; i < c->max_buckets; i++) {
-		int size = 0;
-		u32 alg;
-		struct crush_bucket *b;
-
-		ceph_decode_32_safe(p, end, alg, bad);
-		if (alg == 0) {
-			c->buckets[i] = NULL;
-			continue;
-		}
-		dout("crush_decode bucket %d off %x %p to %p\n",
-		     i, (int)(*p-start), *p, end);
-
-		switch (alg) {
-		case CRUSH_BUCKET_UNIFORM:
-			size = sizeof(struct crush_bucket_uniform);
-			break;
-		case CRUSH_BUCKET_LIST:
-			size = sizeof(struct crush_bucket_list);
-			break;
-		case CRUSH_BUCKET_TREE:
-			size = sizeof(struct crush_bucket_tree);
-			break;
-		case CRUSH_BUCKET_STRAW:
-			size = sizeof(struct crush_bucket_straw);
-			break;
-		default:
-			err = -EINVAL;
-			goto bad;
-		}
-		BUG_ON(size == 0);
-		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
-		if (b == NULL)
-			goto badmem;
-
-		ceph_decode_need(p, end, 4*sizeof(u32), bad);
-		b->id = ceph_decode_32(p);
-		b->type = ceph_decode_16(p);
-		b->alg = ceph_decode_8(p);
-		b->hash = ceph_decode_8(p);
-		b->weight = ceph_decode_32(p);
-		b->size = ceph_decode_32(p);
-
-		dout("crush_decode bucket size %d off %x %p to %p\n",
-		     b->size, (int)(*p-start), *p, end);
-
-		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
-		if (b->items == NULL)
-			goto badmem;
-		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-		if (b->perm == NULL)
-			goto badmem;
-		b->perm_n = 0;
-
-		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
-		for (j = 0; j < b->size; j++)
-			b->items[j] = ceph_decode_32(p);
-
-		switch (b->alg) {
-		case CRUSH_BUCKET_UNIFORM:
-			err = crush_decode_uniform_bucket(p, end,
-				  (struct crush_bucket_uniform *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_LIST:
-			err = crush_decode_list_bucket(p, end,
-			       (struct crush_bucket_list *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_TREE:
-			err = crush_decode_tree_bucket(p, end,
-				(struct crush_bucket_tree *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_STRAW:
-			err = crush_decode_straw_bucket(p, end,
-				(struct crush_bucket_straw *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		}
-	}
-
-	/* rules */
-	dout("rule vec is %p\n", c->rules);
-	for (i = 0; i < c->max_rules; i++) {
-		u32 yes;
-		struct crush_rule *r;
-
-		ceph_decode_32_safe(p, end, yes, bad);
-		if (!yes) {
-			dout("crush_decode NO rule %d off %x %p to %p\n",
-			     i, (int)(*p-start), *p, end);
-			c->rules[i] = NULL;
-			continue;
-		}
-
-		dout("crush_decode rule %d off %x %p to %p\n",
-		     i, (int)(*p-start), *p, end);
-
-		/* len */
-		ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
-		err = -EINVAL;
-		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
-			goto bad;
-#endif
-		r = c->rules[i] = kmalloc(sizeof(*r) +
-					  yes*sizeof(struct crush_rule_step),
-					  GFP_NOFS);
-		if (r == NULL)
-			goto badmem;
-		dout(" rule %d is at %p\n", i, r);
-		r->len = yes;
-		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
-		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
-		for (j = 0; j < r->len; j++) {
-			r->steps[j].op = ceph_decode_32(p);
-			r->steps[j].arg1 = ceph_decode_32(p);
-			r->steps[j].arg2 = ceph_decode_32(p);
-		}
-	}
-
-	/* ignore trailing name maps. */
-
-	dout("crush_decode success\n");
-	return c;
-
-badmem:
-	err = -ENOMEM;
-bad:
-	dout("crush_decode fail %d\n", err);
-	crush_destroy(c);
-	return ERR_PTR(err);
-}
-
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
-	u64 a = *(u64 *)&l;
-	u64 b = *(u64 *)&r;
-
-	if (a < b)
-		return -1;
-	if (a > b)
-		return 1;
-	return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-			       struct rb_root *root)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_mapping *pg = NULL;
-	int c;
-
-	while (*p) {
-		parent = *p;
-		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		c = pgid_cmp(new->pgid, pg->pgid);
-		if (c < 0)
-			p = &(*p)->rb_left;
-		else if (c > 0)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg pgid)
-{
-	struct rb_node *n = root->rb_node;
-	struct ceph_pg_mapping *pg;
-	int c;
-
-	while (n) {
-		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = pgid_cmp(pgid, pg->pgid);
-		if (c < 0)
-			n = n->rb_left;
-		else if (c > 0)
-			n = n->rb_right;
-		else
-			return pg;
-	}
-	return NULL;
-}
-
-/*
- * rbtree of pg pool info
- */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_pool_info *pi = NULL;
-
-	while (*p) {
-		parent = *p;
-		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
-		if (new->id < pi->id)
-			p = &(*p)->rb_left;
-		else if (new->id > pi->id)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
-{
-	struct ceph_pg_pool_info *pi;
-	struct rb_node *n = root->rb_node;
-
-	while (n) {
-		pi = rb_entry(n, struct ceph_pg_pool_info, node);
-		if (id < pi->id)
-			n = n->rb_left;
-		else if (id > pi->id)
-			n = n->rb_right;
-		else
-			return pi;
-	}
-	return NULL;
-}
-
-int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
-{
-	struct rb_node *rbp;
-
-	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
-		struct ceph_pg_pool_info *pi =
-			rb_entry(rbp, struct ceph_pg_pool_info, node);
-		if (pi->name && strcmp(pi->name, name) == 0)
-			return pi->id;
-	}
-	return -ENOENT;
-}
-
-static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
-{
-	rb_erase(&pi->node, root);
-	kfree(pi->name);
-	kfree(pi);
-}
-
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
-{
-	unsigned n, m;
-
-	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-	calc_pg_masks(pi);
-
-	/* num_snaps * snap_info_t */
-	n = le32_to_cpu(pi->v.num_snaps);
-	while (n--) {
-		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-				 sizeof(struct ceph_timespec), bad);
-		*p += sizeof(u64) +       /* key */
-			1 + sizeof(u64) + /* u8, snapid */
-			sizeof(struct ceph_timespec);
-		m = ceph_decode_32(p);    /* snap name */
-		*p += m;
-	}
-
-	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
-	return 0;
-
-bad:
-	return -EINVAL;
-}
-
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
-{
-	struct ceph_pg_pool_info *pi;
-	u32 num, len, pool;
-
-	ceph_decode_32_safe(p, end, num, bad);
-	dout(" %d pool names\n", num);
-	while (num--) {
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_32_safe(p, end, len, bad);
-		dout("  pool %d len %d\n", pool, len);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi) {
-			kfree(pi->name);
-			pi->name = kmalloc(len + 1, GFP_NOFS);
-			if (pi->name) {
-				memcpy(pi->name, *p, len);
-				pi->name[len] = '\0';
-				dout("  name is %s\n", pi->name);
-			}
-		}
-		*p += len;
-	}
-	return 0;
-
-bad:
-	return -EINVAL;
-}
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-	dout("osdmap_destroy %p\n", map);
-	if (map->crush)
-		crush_destroy(map->crush);
-	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-		struct ceph_pg_mapping *pg =
-			rb_entry(rb_first(&map->pg_temp),
-				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->pg_temp);
-		kfree(pg);
-	}
-	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-		struct ceph_pg_pool_info *pi =
-			rb_entry(rb_first(&map->pg_pools),
-				 struct ceph_pg_pool_info, node);
-		__remove_pg_pool(&map->pg_pools, pi);
-	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map);
-}
-
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-	u8 *state;
-	struct ceph_entity_addr *addr;
-	u32 *weight;
-
-	state = kcalloc(max, sizeof(*state), GFP_NOFS);
-	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-	if (state == NULL || addr == NULL || weight == NULL) {
-		kfree(state);
-		kfree(addr);
-		kfree(weight);
-		return -ENOMEM;
-	}
-
-	/* copy old? */
-	if (map->osd_state) {
-		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-		kfree(map->osd_state);
-		kfree(map->osd_addr);
-		kfree(map->osd_weight);
-	}
-
-	map->osd_state = state;
-	map->osd_weight = weight;
-	map->osd_addr = addr;
-	map->max_osd = max;
-	return 0;
-}
-
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
-	struct ceph_osdmap *map;
-	u16 version;
-	u32 len, max, i;
-	u8 ev;
-	int err = -EINVAL;
-	void *start = *p;
-	struct ceph_pg_pool_info *pi;
-
-	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (map == NULL)
-		return ERR_PTR(-ENOMEM);
-	map->pg_temp = RB_ROOT;
-
-	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_VERSION) {
-		pr_warning("got unknown v %d > %d of osdmap\n", version,
-			   CEPH_OSDMAP_VERSION);
-		goto bad;
-	}
-
-	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
-	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
-	map->epoch = ceph_decode_32(p);
-	ceph_decode_copy(p, &map->created, sizeof(map->created));
-	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-
-	ceph_decode_32_safe(p, end, max, bad);
-	while (max--) {
-		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-		pi = kzalloc(sizeof(*pi), GFP_NOFS);
-		if (!pi)
-			goto bad;
-		pi->id = ceph_decode_32(p);
-		ev = ceph_decode_8(p); /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			kfree(pi);
-			goto bad;
-		}
-		err = __decode_pool(p, end, pi);
-		if (err < 0)
-			goto bad;
-		__insert_pg_pool(&map->pg_pools, pi);
-	}
-
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
-
-	ceph_decode_32_safe(p, end, map->pool_max, bad);
-
-	ceph_decode_32_safe(p, end, map->flags, bad);
-
-	max = ceph_decode_32(p);
-
-	/* (re)alloc osd arrays */
-	err = osdmap_set_max_osd(map, max);
-	if (err < 0)
-		goto bad;
-	dout("osdmap_decode max_osd = %d\n", map->max_osd);
-
-	/* osds */
-	err = -EINVAL;
-	ceph_decode_need(p, end, 3*sizeof(u32) +
-			 map->max_osd*(1 + sizeof(*map->osd_weight) +
-				       sizeof(*map->osd_addr)), bad);
-	*p += 4; /* skip length field (should match max) */
-	ceph_decode_copy(p, map->osd_state, map->max_osd);
-
-	*p += 4; /* skip length field (should match max) */
-	for (i = 0; i < map->max_osd; i++)
-		map->osd_weight[i] = ceph_decode_32(p);
-
-	*p += 4; /* skip length field (should match max) */
-	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-	for (i = 0; i < map->max_osd; i++)
-		ceph_decode_addr(&map->osd_addr[i]);
-
-	/* pg_temp */
-	ceph_decode_32_safe(p, end, len, bad);
-	for (i = 0; i < len; i++) {
-		int n, j;
-		struct ceph_pg pgid;
-		struct ceph_pg_mapping *pg;
-
-		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
-		n = ceph_decode_32(p);
-		ceph_decode_need(p, end, n * sizeof(u32), bad);
-		err = -ENOMEM;
-		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-		if (!pg)
-			goto bad;
-		pg->pgid = pgid;
-		pg->len = n;
-		for (j = 0; j < n; j++)
-			pg->osds[j] = ceph_decode_32(p);
-
-		err = __insert_pg_mapping(pg, &map->pg_temp);
-		if (err)
-			goto bad;
-		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
-	}
-
-	/* crush */
-	ceph_decode_32_safe(p, end, len, bad);
-	dout("osdmap_decode crush len %d from off 0x%x\n", len,
-	     (int)(*p - start));
-	ceph_decode_need(p, end, len, bad);
-	map->crush = crush_decode(*p, end);
-	*p += len;
-	if (IS_ERR(map->crush)) {
-		err = PTR_ERR(map->crush);
-		map->crush = NULL;
-		goto bad;
-	}
-
-	/* ignore the rest of the map */
-	*p = end;
-
-	dout("osdmap_decode done %p %p\n", *p, end);
-	return map;
-
-bad:
-	dout("osdmap_decode fail\n");
-	ceph_osdmap_destroy(map);
-	return ERR_PTR(err);
-}
-
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					     struct ceph_osdmap *map,
-					     struct ceph_messenger *msgr)
-{
-	struct crush_map *newcrush = NULL;
-	struct ceph_fsid fsid;
-	u32 epoch = 0;
-	struct ceph_timespec modified;
-	u32 len, pool;
-	__s32 new_pool_max, new_flags, max;
-	void *start = *p;
-	int err = -EINVAL;
-	u16 version;
-	struct rb_node *rbp;
-
-	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_INC_VERSION) {
-		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-			   CEPH_OSDMAP_INC_VERSION);
-		goto bad;
-	}
-
-	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
-			 bad);
-	ceph_decode_copy(p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(p);
-	BUG_ON(epoch != map->epoch+1);
-	ceph_decode_copy(p, &modified, sizeof(modified));
-	new_pool_max = ceph_decode_32(p);
-	new_flags = ceph_decode_32(p);
-
-	/* full map? */
-	ceph_decode_32_safe(p, end, len, bad);
-	if (len > 0) {
-		dout("apply_incremental full map len %d, %p to %p\n",
-		     len, *p, end);
-		return osdmap_decode(p, min(*p+len, end));
-	}
-
-	/* new crush? */
-	ceph_decode_32_safe(p, end, len, bad);
-	if (len > 0) {
-		dout("apply_incremental new crush map len %d, %p to %p\n",
-		     len, *p, end);
-		newcrush = crush_decode(*p, min(*p+len, end));
-		if (IS_ERR(newcrush))
-			return ERR_CAST(newcrush);
-		*p += len;
-	}
-
-	/* new flags? */
-	if (new_flags >= 0)
-		map->flags = new_flags;
-	if (new_pool_max >= 0)
-		map->pool_max = new_pool_max;
-
-	ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
-	/* new max? */
-	max = ceph_decode_32(p);
-	if (max >= 0) {
-		err = osdmap_set_max_osd(map, max);
-		if (err < 0)
-			goto bad;
-	}
-
-	map->epoch++;
-	map->modified = map->modified;
-	if (newcrush) {
-		if (map->crush)
-			crush_destroy(map->crush);
-		map->crush = newcrush;
-		newcrush = NULL;
-	}
-
-	/* new_pool */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		__u8 ev;
-		struct ceph_pg_pool_info *pi;
-
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-		ev = ceph_decode_8(p);  /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			goto bad;
-		}
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (!pi) {
-			pi = kzalloc(sizeof(*pi), GFP_NOFS);
-			if (!pi) {
-				err = -ENOMEM;
-				goto bad;
-			}
-			pi->id = pool;
-			__insert_pg_pool(&map->pg_pools, pi);
-		}
-		err = __decode_pool(p, end, pi);
-		if (err < 0)
-			goto bad;
-	}
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
-
-	/* old_pool */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		struct ceph_pg_pool_info *pi;
-
-		ceph_decode_32_safe(p, end, pool, bad);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi)
-			__remove_pg_pool(&map->pg_pools, pi);
-	}
-
-	/* new_up */
-	err = -EINVAL;
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		struct ceph_entity_addr addr;
-		ceph_decode_32_safe(p, end, osd, bad);
-		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
-		ceph_decode_addr(&addr);
-		pr_info("osd%d up\n", osd);
-		BUG_ON(osd >= map->max_osd);
-		map->osd_state[osd] |= CEPH_OSD_UP;
-		map->osd_addr[osd] = addr;
-	}
-
-	/* new_down */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		ceph_decode_32_safe(p, end, osd, bad);
-		(*p)++;  /* clean flag */
-		pr_info("osd%d down\n", osd);
-		if (osd < map->max_osd)
-			map->osd_state[osd] &= ~CEPH_OSD_UP;
-	}
-
-	/* new_weight */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd, off;
-		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		osd = ceph_decode_32(p);
-		off = ceph_decode_32(p);
-		pr_info("osd%d weight 0x%x %s\n", osd, off,
-		     off == CEPH_OSD_IN ? "(in)" :
-		     (off == CEPH_OSD_OUT ? "(out)" : ""));
-		if (osd < map->max_osd)
-			map->osd_weight[osd] = off;
-	}
-
-	/* new_pg_temp */
-	rbp = rb_first(&map->pg_temp);
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		struct ceph_pg_mapping *pg;
-		int j;
-		struct ceph_pg pgid;
-		u32 pglen;
-		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
-		pglen = ceph_decode_32(p);
-
-		/* remove any? */
-		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
-						node)->pgid, pgid) <= 0) {
-			struct ceph_pg_mapping *cur =
-				rb_entry(rbp, struct ceph_pg_mapping, node);
-
-			rbp = rb_next(rbp);
-			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-			rb_erase(&cur->node, &map->pg_temp);
-			kfree(cur);
-		}
-
-		if (pglen) {
-			/* insert */
-			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
-			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-			if (!pg) {
-				err = -ENOMEM;
-				goto bad;
-			}
-			pg->pgid = pgid;
-			pg->len = pglen;
-			for (j = 0; j < pglen; j++)
-				pg->osds[j] = ceph_decode_32(p);
-			err = __insert_pg_mapping(pg, &map->pg_temp);
-			if (err) {
-				kfree(pg);
-				goto bad;
-			}
-			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-			     pglen);
-		}
-	}
-	while (rbp) {
-		struct ceph_pg_mapping *cur =
-			rb_entry(rbp, struct ceph_pg_mapping, node);
-
-		rbp = rb_next(rbp);
-		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-		rb_erase(&cur->node, &map->pg_temp);
-		kfree(cur);
-	}
-
-	/* ignore the rest */
-	*p = end;
-	return map;
-
-bad:
-	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
-	       epoch, (int)(*p - start), *p, start, end);
-	print_hex_dump(KERN_DEBUG, "osdmap: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       start, end - start, true);
-	if (newcrush)
-		crush_destroy(newcrush);
-	return ERR_PTR(err);
-}
-
-
-
-
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 *plen,
-				   u64 *ono,
-				   u64 *oxoff, u64 *oxlen)
-{
-	u32 osize = le32_to_cpu(layout->fl_object_size);
-	u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	u32 bl, stripeno, stripepos, objsetno;
-	u32 su_per_object;
-	u64 t, su_offset;
-
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
-	     osize, su);
-	su_per_object = osize / su;
-	dout("osize %u / su %u = su_per_object %u\n", osize, su,
-	     su_per_object);
-
-	BUG_ON((su & ~PAGE_MASK) != 0);
-	/* bl = *off / su; */
-	t = off;
-	do_div(t, su);
-	bl = t;
-	dout("off %llu / su %u = bl %u\n", off, su, bl);
-
-	stripeno = bl / sc;
-	stripepos = bl % sc;
-	objsetno = stripeno / su_per_object;
-
-	*ono = objsetno * sc + stripepos;
-	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
-
-	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
-	t = off;
-	su_offset = do_div(t, su);
-	*oxoff = su_offset + (stripeno % su_per_object) * su;
-
-	/*
-	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (plen) or
-	 * the remainder of the current stripe being written to.
-	 */
-	*oxlen = min_t(u64, *plen, su - su_offset);
-	*plen = *oxlen;
-
-	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
-			    const char *oid,
-			    struct ceph_file_layout *fl,
-			    struct ceph_osdmap *osdmap)
-{
-	unsigned num, num_mask;
-	struct ceph_pg pgid;
-	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
-	int poolid = le32_to_cpu(fl->fl_pg_pool);
-	struct ceph_pg_pool_info *pool;
-	unsigned ps;
-
-	BUG_ON(!osdmap);
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-	if (!pool)
-		return -EIO;
-	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-	if (preferred >= 0) {
-		ps += preferred;
-		num = le32_to_cpu(pool->v.lpg_num);
-		num_mask = pool->lpg_num_mask;
-	} else {
-		num = le32_to_cpu(pool->v.pg_num);
-		num_mask = pool->pg_num_mask;
-	}
-
-	pgid.ps = cpu_to_le16(ps);
-	pgid.preferred = cpu_to_le16(preferred);
-	pgid.pool = fl->fl_pg_pool;
-	if (preferred >= 0)
-		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
-		     (int)preferred);
-	else
-		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
-
-	ol->ol_pgid = pgid;
-	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-	return 0;
-}
-
-/*
- * Calculate raw osd vector for the given pgid.  Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *num)
-{
-	struct ceph_pg_mapping *pg;
-	struct ceph_pg_pool_info *pool;
-	int ruleno;
-	unsigned poolid, ps, pps;
-	int preferred;
-
-	/* pg_temp? */
-	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
-	if (pg) {
-		*num = pg->len;
-		return pg->osds;
-	}
-
-	/* crush */
-	poolid = le32_to_cpu(pgid.pool);
-	ps = le16_to_cpu(pgid.ps);
-	preferred = (s16)le16_to_cpu(pgid.preferred);
-
-	/* don't forcefeed bad device ids to crush */
-	if (preferred >= osdmap->max_osd ||
-	    preferred >= osdmap->crush->max_devices)
-		preferred = -1;
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-	if (!pool)
-		return NULL;
-	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-				 pool->v.type, pool->v.size);
-	if (ruleno < 0) {
-		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-		       poolid, pool->v.crush_ruleset, pool->v.type,
-		       pool->v.size);
-		return NULL;
-	}
-
-	if (preferred >= 0)
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.lpgp_num),
-				      pool->lpgp_num_mask);
-	else
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.pgp_num),
-				      pool->pgp_num_mask);
-	pps += poolid;
-	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-			     min_t(int, pool->v.size, *num),
-			     preferred, osdmap->osd_weight);
-	return osds;
-}
-
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *acting)
-{
-	int rawosds[CEPH_PG_MAX_SIZE], *osds;
-	int i, o, num = CEPH_PG_MAX_SIZE;
-
-	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-	if (!osds)
-		return -1;
-
-	/* primary is first up osd */
-	o = 0;
-	for (i = 0; i < num; i++)
-		if (ceph_osd_is_up(osdmap, osds[i]))
-			acting[o++] = osds[i];
-	return o;
-}
-
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
-{
-	int rawosds[CEPH_PG_MAX_SIZE], *osds;
-	int i, num = CEPH_PG_MAX_SIZE;
-
-	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-	if (!osds)
-		return -1;
-
-	/* primary is first up osd */
-	for (i = 0; i < num; i++)
-		if (ceph_osd_is_up(osdmap, osds[i]))
-			return osds[i];
-	return -1;
-}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index a592b211be39..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds.  That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
-	struct rb_node node;
-	int id;
-	struct ceph_pg_pool v;
-	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
-	char *name;
-};
-
-struct ceph_pg_mapping {
-	struct rb_node node;
-	struct ceph_pg pgid;
-	int len;
-	int osds[];
-};
-
-struct ceph_osdmap {
-	struct ceph_fsid fsid;
-	u32 epoch;
-	u32 mkfs_epoch;
-	struct ceph_timespec created, modified;
-
-	u32 flags;         /* CEPH_OSDMAP_* */
-
-	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-	u8 *osd_state;     /* CEPH_OSD_* */
-	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
-	struct ceph_entity_addr *osd_addr;
-
-	struct rb_root pg_temp;
-	struct rb_root pg_pools;
-	u32 pool_max;
-
-	/* the CRUSH map specifies the mapping of placement groups to
-	 * the list of osds that store+replicate them. */
-	struct crush_map *crush;
-};
-
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-	((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
-	((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
-	((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_stripe_unit) *
-		le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_object_size) *
-		le32_to_cpu(l->fl_stripe_count);
-}
-
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
-	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
-	return map && (map->flags & flag);
-}
-
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
-						     int osd)
-{
-	if (osd >= map->max_osd)
-		return NULL;
-	return &map->osd_addr[osd];
-}
-
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					    struct ceph_osdmap *map,
-					    struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					  u64 off, u64 *plen,
-					  u64 *bno, u64 *oxoff, u64 *oxlen);
-
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
-				   const char *oid,
-				   struct ceph_file_layout *fl,
-				   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			       int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg pgid);
-
-extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
-
-#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 326e1c04176f..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
-	struct page *page = list_entry(pl->head.prev, struct page,
-				       lru);
-	kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-	if (pl->mapped_tail)
-		ceph_pagelist_unmap_tail(pl);
-
-	while (!list_empty(&pl->head)) {
-		struct page *page = list_first_entry(&pl->head, struct page,
-						     lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-	struct page *page = __page_cache_alloc(GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
-	pl->room += PAGE_SIZE;
-	list_add_tail(&page->lru, &pl->head);
-	if (pl->mapped_tail)
-		ceph_pagelist_unmap_tail(pl);
-	pl->mapped_tail = kmap(page);
-	return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
-{
-	while (pl->room < len) {
-		size_t bit = pl->room;
-		int ret;
-
-		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-		       buf, bit);
-		pl->length += bit;
-		pl->room -= bit;
-		buf += bit;
-		len -= bit;
-		ret = ceph_pagelist_addpage(pl);
-		if (ret)
-			return ret;
-	}
-
-	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-	pl->length += len;
-	pl->room -= len;
-	return 0;
-}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index cc9327aa1c98..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __FS_CEPH_PAGELIST_H
-#define __FS_CEPH_PAGELIST_H
-
-#include <linux/list.h>
-
-struct ceph_pagelist {
-	struct list_head head;
-	void *mapped_tail;
-	size_t length;
-	size_t room;
-};
-
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
-	INIT_LIST_HEAD(&pl->head);
-	pl->mapped_tail = NULL;
-	pl->length = 0;
-	pl->room = 0;
-}
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
-
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
-
-static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
-{
-	__le64 ev = cpu_to_le64(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
-{
-	__le32 ev = cpu_to_le32(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
-{
-	__le16 ev = cpu_to_le16(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
-{
-	return ceph_pagelist_append(pl, &v, 1);
-}
-static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
-					      char *s, size_t len)
-{
-	int ret = ceph_pagelist_encode_32(pl, len);
-	if (ret)
-		return ret;
-	if (len)
-		return ceph_pagelist_append(pl, s, len);
-	return 0;
-}
-
-#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
-#ifndef CEPH_RADOS_H
-#define CEPH_RADOS_H
-
-/*
- * Data types for the Ceph distributed object storage layer RADOS
- * (Reliable Autonomic Distributed Object Store).
- */
-
-#include "msgr.h"
-
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 5
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     5
-
-/*
- * fs id
- */
-struct ceph_fsid {
-	unsigned char fsid[16];
-};
-
-static inline int ceph_fsid_compare(const struct ceph_fsid *a,
-				    const struct ceph_fsid *b)
-{
-	return memcmp(a, b, sizeof(*a));
-}
-
-/*
- * ino, object, etc.
- */
-typedef __le64 ceph_snapid_t;
-#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
-#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
-#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
-
-struct ceph_timespec {
-	__le32 tv_sec;
-	__le32 tv_nsec;
-} __attribute__ ((packed));
-
-
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH     1
-#define CEPH_OBJECT_LAYOUT_LINEAR   2
-#define CEPH_OBJECT_LAYOUT_HASHINO  3
-
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH  0
-#define CEPH_PG_LAYOUT_HASH   1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
-
-/*
- * placement group.
- * we encode this into one __le64.
- */
-struct ceph_pg {
-	__le16 preferred; /* preferred primary osd */
-	__le16 ps;        /* placement seed */
-	__le32 pool;      /* object pool */
-} __attribute__ ((packed));
-
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- *  pg_num -- base number of pseudorandomly placed pgs
- *
- *  pgp_num -- effective number when calculating pg placement.  this
- * is used for pg_num increases.  new pgs result in data being "split"
- * into new pgs.  for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split.  only _then_ are the new
- * pgs placed independently.
- *
- *  lpg_num -- localized pg count (per device).  replicas are randomly
- * selected.
- *
- *  lpgp_num -- as above.
- */
-#define CEPH_PG_TYPE_REP     1
-#define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-	__u8 type;                /* CEPH_PG_TYPE_* */
-	__u8 size;                /* number of osds in each pg */
-	__u8 crush_ruleset;       /* crush placement rule */
-	__u8 object_hash;         /* hash mapping object name to ps */
-	__le32 pg_num, pgp_num;   /* number of pg's */
-	__le32 lpg_num, lpgp_num; /* number of localized pg's */
-	__le32 last_change;       /* most recent epoch changed */
-	__le64 snap_seq;          /* seq for per-pool snapshot */
-	__le32 snap_epoch;        /* epoch of last snap */
-	__le32 num_snaps;
-	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-	__le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
-
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time.  b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
-	if ((x & bmask) < b)
-		return x & bmask;
-	else
-		return x & (bmask >> 1);
-}
-
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
-	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
-	__le32 ol_stripe_unit;    /* for per-object parity, if any */
-} __attribute__ ((packed));
-
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
-	__le32 epoch;
-	__le64 version;
-} __attribute__ ((packed));
-
-/*
- * osd map bits
- */
-
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP     2
-
-/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN  0x10000
-#define CEPH_OSD_OUT 0
-
-
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
-
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE       0xf000
-#define CEPH_OSD_OP_MODE_RD    0x1000
-#define CEPH_OSD_OP_MODE_WR    0x2000
-#define CEPH_OSD_OP_MODE_RMW   0x3000
-#define CEPH_OSD_OP_MODE_SUB   0x4000
-
-#define CEPH_OSD_OP_TYPE       0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK  0x0100
-#define CEPH_OSD_OP_TYPE_DATA  0x0200
-#define CEPH_OSD_OP_TYPE_ATTR  0x0300
-#define CEPH_OSD_OP_TYPE_EXEC  0x0400
-#define CEPH_OSD_OP_TYPE_PG    0x0500
-
-enum {
-	/** data **/
-	/* read */
-	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
-	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-
-	/* fancy read */
-	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-
-	/* write */
-	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
-	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
-	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
-	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
-	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
-	/* fancy write */
-	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
-	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
-	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
-	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
-	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
-	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
-	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-
-	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
-	CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
-
-	/** attrs **/
-	/* read */
-	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
-	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
-	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-
-	/* write */
-	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
-	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
-	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
-	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
-	/** subop **/
-	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
-	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
-	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
-	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
-	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
-
-	/** lock **/
-	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
-	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
-	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
-	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
-	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
-	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-
-	/** exec **/
-	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-
-	/** pg **/
-	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
-};
-
-static inline int ceph_osd_op_type_lock(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-static inline int ceph_osd_op_type_exec(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
-}
-static inline int ceph_osd_op_type_pg(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
-}
-
-static inline int ceph_osd_op_mode_subop(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-
-/*
- * note that the following tmap stuff is also defined in the ceph librados.h
- * any modification here needs to be updated there
- */
-#define CEPH_OSD_TMAP_HDR 'h'
-#define CEPH_OSD_TMAP_SET 's'
-#define CEPH_OSD_TMAP_RM  'r'
-
-extern const char *ceph_osd_op_name(int op);
-
-
-/*
- * osd op flags
- *
- * An op may be READ, WRITE, or READ|WRITE.
- */
-enum {
-	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
-	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
-	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
-	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
-	CEPH_OSD_FLAG_READ = 16,        /* op may read */
-	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
-	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
-	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
-	CEPH_OSD_FLAG_BALANCE_READS = 256,
-	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
-	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
-	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
-	CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
-};
-
-enum {
-	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
-};
-
-#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-
-/* xattr comparison */
-enum {
-	CEPH_OSD_CMPXATTR_OP_NOP = 0,
-	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
-	CEPH_OSD_CMPXATTR_OP_NE  = 2,
-	CEPH_OSD_CMPXATTR_OP_GT  = 3,
-	CEPH_OSD_CMPXATTR_OP_GTE = 4,
-	CEPH_OSD_CMPXATTR_OP_LT  = 5,
-	CEPH_OSD_CMPXATTR_OP_LTE = 6
-};
-
-enum {
-	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
-	CEPH_OSD_CMPXATTR_MODE_U64    = 2
-};
-
-/*
- * an individual object operation.  each may be accompanied by some data
- * payload
- */
-struct ceph_osd_op {
-	__le16 op;           /* CEPH_OSD_OP_* */
-	__le32 flags;        /* CEPH_OSD_FLAG_* */
-	union {
-		struct {
-			__le64 offset, length;
-			__le64 truncate_size;
-			__le32 truncate_seq;
-		} __attribute__ ((packed)) extent;
-		struct {
-			__le32 name_len;
-			__le32 value_len;
-			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
-			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
-		} __attribute__ ((packed)) xattr;
-		struct {
-			__u8 class_len;
-			__u8 method_len;
-			__u8 argc;
-			__le32 indata_len;
-		} __attribute__ ((packed)) cls;
-		struct {
-			__le64 cookie, count;
-		} __attribute__ ((packed)) pgls;
-	        struct {
-		        __le64 snapid;
-	        } __attribute__ ((packed)) snap;
-	};
-	__le32 payload_len;
-} __attribute__ ((packed));
-
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-	__le32 client_inc;                 /* client incarnation */
-	struct ceph_object_layout layout;  /* pgid */
-	__le32 osdmap_epoch;               /* client's osdmap epoch */
-
-	__le32 flags;
-
-	struct ceph_timespec mtime;        /* for mutations only */
-	struct ceph_eversion reassert_version; /* if we are replaying op */
-
-	__le32 object_len;     /* length of object name */
-
-	__le64 snapid;         /* snapid to read */
-	__le64 snap_seq;       /* writer's snap context */
-	__le32 num_snaps;
-
-	__le16 num_ops;
-	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
-	__le32 client_inc;                /* client incarnation */
-	__le32 flags;
-	struct ceph_object_layout layout;
-	__le32 osdmap_epoch;
-	struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
-	__le32 result;                    /* result code */
-
-	__le32 object_len;                /* length of object name */
-	__le32 num_ops;
-	struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-
-
-#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/sort.h>
 #include <linux/slab.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 /*
  * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		      struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	int mds = session->s_mds;
 	u64 split;
 	int op;
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
new file mode 100644
index 000000000000..cd5097d7c804
--- /dev/null
+++ b/fs/ceph/strings.c
@@ -0,0 +1,117 @@
+/*
+ * Ceph fs string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+
+const char *ceph_mds_state_name(int s)
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+	}
+	return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+	case CEPH_SESSION_OPEN: return "open";
+	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+	case CEPH_SESSION_CLOSE: return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+	case CEPH_SESSION_STALE: return "stale";
+	case CEPH_SESSION_RECALL_STATE: return "recall_state";
+	}
+	return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	}
+	return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT: return "grant";
+	case CEPH_CAP_OP_REVOKE: return "revoke";
+	case CEPH_CAP_OP_TRUNC: return "trunc";
+	case CEPH_CAP_OP_EXPORT: return "export";
+	case CEPH_CAP_OP_IMPORT: return "import";
+	case CEPH_CAP_OP_UPDATE: return "update";
+	case CEPH_CAP_OP_DROP: return "drop";
+	case CEPH_CAP_OP_FLUSH: return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE: return "release";
+	case CEPH_CAP_OP_RENEW: return "renew";
+	}
+	return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE: return "revoke";
+	case CEPH_MDS_LEASE_RELEASE: return "release";
+	case CEPH_MDS_LEASE_RENEW: return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+	}
+	return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE: return "update";
+	case CEPH_SNAP_OP_CREATE: return "create";
+	case CEPH_SNAP_OP_DESTROY: return "destroy";
+	case CEPH_SNAP_OP_SPLIT: return "split";
+	}
+	return "???";
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -26,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 	if (err < 0)
 		return err;
 
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
 	if (!wait) {
 		dout("sync_fs (non-blocking)\n");
-		ceph_flush_dirty_caps(&client->mdsc);
+		ceph_flush_dirty_caps(fsc->mdsc);
 		dout("sync_fs (non-blocking) done\n");
 		return 0;
 	}
 
 	dout("sync_fs (blocking)\n");
-	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
 	dout("sync_fs (blocking) done\n");
 	return 0;
 }
 
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsid=%pU", &args->fsid);
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-
-	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-		seq_printf(m, ",osdkeepalivetimeout=%d",
-			 args->osd_keepalive_timeout);
-	if (args->wsize)
-		seq_printf(m, ",wsize=%d", args->wsize);
-	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-		seq_printf(m, ",rsize=%d", args->rsize);
-	if (args->congestion_kb != default_congestion_kb())
-		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_min=%d",
-			 args->caps_wanted_delay_min);
-	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_max=%d",
-			   args->caps_wanted_delay_max);
-	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-		seq_printf(m, ",cap_release_safety=%d",
-			   args->cap_release_safety);
-	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (ceph_inode_cachep == NULL)
-		return -ENOMEM;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_cap_cachep == NULL)
-		goto bad_cap;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_dentry_cachep == NULL)
-		goto bad_dentry;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_file_cachep == NULL)
-		goto bad_file;
-
-	return 0;
-
-bad_file:
-	kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-	kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-	kmem_cache_destroy(ceph_inode_cachep);
-	return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-	kmem_cache_destroy(ceph_inode_cachep);
-	kmem_cache_destroy(ceph_cap_cachep);
-	kmem_cache_destroy(ceph_dentry_cachep);
-	kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-	struct ceph_client *client = ceph_sb_to_client(sb);
-
-	dout("ceph_umount_begin - starting forced umount\n");
-	if (!client)
-		return;
-	client->mount_state = CEPH_MOUNT_SHUTDOWN;
-	return;
-}
-
-static const struct super_operations ceph_super_ops = {
-	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
-	.write_inode    = ceph_write_inode,
-	.sync_fs        = ceph_sync_fs,
-	.put_super	= ceph_put_super,
-	.show_options   = ceph_show_options,
-	.statfs		= ceph_statfs,
-	.umount_begin   = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN: return "shutdown";
-	case CEPH_MSG_PING: return "ping";
-	case CEPH_MSG_AUTH: return "auth";
-	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-	case CEPH_MSG_MON_MAP: return "mon_map";
-	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_STATFS: return "statfs";
-	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_MAP: return "mds_map";
-	case CEPH_MSG_CLIENT_SESSION: return "client_session";
-	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_MAP: return "osd_map";
-	case CEPH_MSG_OSD_OP: return "osd_op";
-	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-	default: return "unknown";
-	}
-}
-
-
 /*
  * mount options
  */
 enum {
 	Opt_wsize,
 	Opt_rsize,
-	Opt_osdtimeout,
-	Opt_osdkeepalivetimeout,
-	Opt_mount_timeout,
-	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
-	Opt_fsid,
 	Opt_snapdirname,
-	Opt_name,
-	Opt_secret,
 	Opt_last_string,
 	/* string args above */
-	Opt_ip,
-	Opt_noshare,
 	Opt_dirstat,
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
-	Opt_nocrc,
 	Opt_noasyncreaddir,
 };
 
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
 	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
-	{Opt_fsid, "fsid=%s"},
 	{Opt_snapdirname, "snapdirname=%s"},
-	{Opt_name, "name=%s"},
-	{Opt_secret, "secret=%s"},
 	/* string args above */
-	{Opt_ip, "ip=%s"},
-	{Opt_noshare, "noshare"},
 	{Opt_dirstat, "dirstat"},
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
-	{Opt_nocrc, "nocrc"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{-1, NULL}
 };
 
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
 {
-	int i = 0;
-	char tmp[3];
-	int err = -EINVAL;
-	int d;
-
-	dout("parse_fsid '%s'\n", str);
-	tmp[2] = 0;
-	while (*str && i < 16) {
-		if (ispunct(*str)) {
-			str++;
-			continue;
+	struct ceph_mount_options *fsopt = private;
+	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token((char *)c, fsopt_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
 		}
-		if (!isxdigit(str[0]) || !isxdigit(str[1]))
-			break;
-		tmp[0] = str[0];
-		tmp[1] = str[1];
-		if (sscanf(tmp, "%x", &d) < 1)
-			break;
-		fsid->fsid[i] = d & 0xff;
-		i++;
-		str += 2;
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else {
+		dout("got token %d\n", token);
 	}
 
-	if (i == 16)
-		err = 0;
-	dout("parse_fsid ret %d got fsid %pU", err, fsid);
-	return err;
+	switch (token) {
+	case Opt_snapdirname:
+		kfree(fsopt->snapdir_name);
+		fsopt->snapdir_name = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->snapdir_name)
+			return -ENOMEM;
+		break;
+
+		/* misc */
+	case Opt_wsize:
+		fsopt->wsize = intval;
+		break;
+	case Opt_rsize:
+		fsopt->rsize = intval;
+		break;
+	case Opt_caps_wanted_delay_min:
+		fsopt->caps_wanted_delay_min = intval;
+		break;
+	case Opt_caps_wanted_delay_max:
+		fsopt->caps_wanted_delay_max = intval;
+		break;
+	case Opt_readdir_max_entries:
+		fsopt->max_readdir = intval;
+		break;
+	case Opt_readdir_max_bytes:
+		fsopt->max_readdir_bytes = intval;
+		break;
+	case Opt_congestion_kb:
+		fsopt->congestion_kb = intval;
+		break;
+	case Opt_dirstat:
+		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_nodirstat:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_rbytes:
+		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_norbytes:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_noasyncreaddir:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
+	default:
+		BUG_ON(token);
+	}
+	return 0;
 }
 
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
-						const char *dev_name,
-						const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
 {
-	struct ceph_mount_args *args;
-	const char *c;
-	int err = -ENOMEM;
-	substring_t argstr[MAX_OPT_ARGS];
+	dout("destroy_mount_options %p\n", args);
+	kfree(args->snapdir_name);
+	kfree(args);
+}
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return ERR_PTR(-ENOMEM);
-	args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
-				 GFP_KERNEL);
-	if (!args->mon_addr)
-		goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
 
-	dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
-	/* start with defaults */
-	args->sb_flags = flags;
-	args->flags = CEPH_OPT_DEFAULT;
-	args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
-	args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-	args->congestion_kb = default_congestion_kb();
-
-	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-	err = -EINVAL;
-	if (!dev_name)
-		goto out;
-	*path = strstr(dev_name, ":/");
-	if (*path == NULL) {
-		pr_err("device name is missing path (no :/ in %s)\n",
-		       dev_name);
-		goto out;
-	}
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+				 struct ceph_options *new_opt,
+				 struct ceph_fs_client *fsc)
+{
+	struct ceph_mount_options *fsopt1 = new_fsopt;
+	struct ceph_mount_options *fsopt2 = fsc->mount_options;
+	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+	int ret;
 
-	/* get mon ip(s) */
-	err = ceph_parse_ips(dev_name, *path, args->mon_addr,
-			     CEPH_MAX_MON, &args->num_mon);
-	if (err < 0)
-		goto out;
+	ret = memcmp(fsopt1, fsopt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+	if (ret)
+		return ret;
+
+	return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+			       struct ceph_options **popt,
+			       int flags, char *options,
+			       const char *dev_name,
+			       const char **path)
+{
+	struct ceph_mount_options *fsopt;
+	const char *dev_name_end;
+	int err = -ENOMEM;
+
+	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+	if (!fsopt)
+		return -ENOMEM;
+
+	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+	
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+	dev_name_end = *path;
+	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 
 	/* path on server */
 	*path += 2;
 	dout("server path '%s'\n", *path);
 
-	/* parse mount options */
-	while ((c = strsep(&options, ",")) != NULL) {
-		int token, intval, ret;
-		if (!*c)
-			continue;
-		err = -EINVAL;
-		token = match_token((char *)c, arg_tokens, argstr);
-		if (token < 0) {
-			pr_err("bad mount option at '%s'\n", c);
-			goto out;
-		}
-		if (token < Opt_last_int) {
-			ret = match_int(&argstr[0], &intval);
-			if (ret < 0) {
-				pr_err("bad mount option arg (not int) "
-				       "at '%s'\n", c);
-				continue;
-			}
-			dout("got int token %d val %d\n", token, intval);
-		} else if (token > Opt_last_int && token < Opt_last_string) {
-			dout("got string token %d val %s\n", token,
-			     argstr[0].from);
-		} else {
-			dout("got token %d\n", token);
-		}
-		switch (token) {
-		case Opt_ip:
-			err = ceph_parse_ips(argstr[0].from,
-					     argstr[0].to,
-					     &args->my_addr,
-					     1, NULL);
-			if (err < 0)
-				goto out;
-			args->flags |= CEPH_OPT_MYIP;
-			break;
-
-		case Opt_fsid:
-			err = parse_fsid(argstr[0].from, &args->fsid);
-			if (err == 0)
-				args->flags |= CEPH_OPT_FSID;
-			break;
-		case Opt_snapdirname:
-			kfree(args->snapdir_name);
-			args->snapdir_name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_name:
-			args->name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_secret:
-			args->secret = kstrndup(argstr[0].from,
-						argstr[0].to-argstr[0].from,
-						GFP_KERNEL);
-			break;
-
-			/* misc */
-		case Opt_wsize:
-			args->wsize = intval;
-			break;
-		case Opt_rsize:
-			args->rsize = intval;
-			break;
-		case Opt_osdtimeout:
-			args->osd_timeout = intval;
-			break;
-		case Opt_osdkeepalivetimeout:
-			args->osd_keepalive_timeout = intval;
-			break;
-		case Opt_osd_idle_ttl:
-			args->osd_idle_ttl = intval;
-			break;
-		case Opt_mount_timeout:
-			args->mount_timeout = intval;
-			break;
-		case Opt_caps_wanted_delay_min:
-			args->caps_wanted_delay_min = intval;
-			break;
-		case Opt_caps_wanted_delay_max:
-			args->caps_wanted_delay_max = intval;
-			break;
-		case Opt_readdir_max_entries:
-			args->max_readdir = intval;
-			break;
-		case Opt_readdir_max_bytes:
-			args->max_readdir_bytes = intval;
-			break;
-		case Opt_congestion_kb:
-			args->congestion_kb = intval;
-			break;
-
-		case Opt_noshare:
-			args->flags |= CEPH_OPT_NOSHARE;
-			break;
-
-		case Opt_dirstat:
-			args->flags |= CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_nodirstat:
-			args->flags &= ~CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_rbytes:
-			args->flags |= CEPH_OPT_RBYTES;
-			break;
-		case Opt_norbytes:
-			args->flags &= ~CEPH_OPT_RBYTES;
-			break;
-		case Opt_nocrc:
-			args->flags |= CEPH_OPT_NOCRC;
-			break;
-		case Opt_noasyncreaddir:
-			args->flags |= CEPH_OPT_NOASYNCREADDIR;
-			break;
-
-		default:
-			BUG_ON(token);
-		}
-	}
-	return args;
+	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+				 parse_fsopt_token, (void *)fsopt);
+	if (err)
+		goto out;
+
+	/* success */
+	*pfsopt = fsopt;
+	return 0;
 
 out:
-	kfree(args->mon_addr);
-	kfree(args);
-	return ERR_PTR(err);
+	destroy_mount_options(fsopt);
+	return err;
 }
 
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
-	dout("destroy_mount_args %p\n", args);
-	kfree(args->snapdir_name);
-	args->snapdir_name = NULL;
-	kfree(args->name);
-	args->name = NULL;
-	kfree(args->secret);
-	args->secret = NULL;
-	kfree(args);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_options *fsopt = fsc->mount_options;
+	struct ceph_options *opt = fsc->client->options;
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsid=%pU", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+
+	if (opt->name)
+		seq_printf(m, ",name=%s", opt->name);
+	if (opt->secret)
+		seq_puts(m, ",secret=<hidden>");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			   opt->osd_keepalive_timeout);
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+
+	if (fsopt->wsize)
+		seq_printf(m, ",wsize=%d", fsopt->wsize);
+	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", fsopt->rsize);
+	if (fsopt->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 fsopt->caps_wanted_delay_min);
+	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   fsopt->caps_wanted_delay_max);
+	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   fsopt->cap_release_safety);
+	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+	return 0;
 }
 
 /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
  */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 {
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc = client->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(fsc->mdsc, msg);
+		return 0;
+
+	default:
+		return -1;
+	}
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+					struct ceph_options *opt)
+{
+	struct ceph_fs_client *fsc;
 	int err = -ENOMEM;
 
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (client == NULL)
+	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+	if (!fsc)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_init(&client->mount_mutex);
-
-	init_waitqueue_head(&client->auth_wq);
+	fsc->client = ceph_create_client(opt, fsc);
+	if (IS_ERR(fsc->client)) {
+		err = PTR_ERR(fsc->client);
+		goto fail;
+	}
+	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+	fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+	fsc->client->monc.want_mdsmap = 1;
 
-	client->sb = NULL;
-	client->mount_state = CEPH_MOUNT_MOUNTING;
-	client->mount_args = args;
+	fsc->mount_options = fsopt;
 
-	client->msgr = NULL;
+	fsc->sb = NULL;
+	fsc->mount_state = CEPH_MOUNT_MOUNTING;
 
-	client->auth_err = 0;
-	atomic_long_set(&client->writeback_count, 0);
+	atomic_long_set(&fsc->writeback_count, 0);
 
-	err = bdi_init(&client->backing_dev_info);
+	err = bdi_init(&fsc->backing_dev_info);
 	if (err < 0)
-		goto fail;
+		goto fail_client;
 
 	err = -ENOMEM;
-	client->wb_wq = create_workqueue("ceph-writeback");
-	if (client->wb_wq == NULL)
+	fsc->wb_wq = create_workqueue("ceph-writeback");
+	if (fsc->wb_wq == NULL)
 		goto fail_bdi;
-	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-	if (client->pg_inv_wq == NULL)
+	fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+	if (fsc->pg_inv_wq == NULL)
 		goto fail_wb_wq;
-	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-	if (client->trunc_wq == NULL)
+	fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+	if (fsc->trunc_wq == NULL)
 		goto fail_pg_inv_wq;
 
 	/* set up mempools */
 	err = -ENOMEM;
-	client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-	if (!client->wb_pagevec_pool)
+	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
 	/* caps */
-	client->min_caps = args->max_readdir;
+	fsc->min_caps = fsopt->max_readdir;
+
+	return fsc;
 
-	/* subsystems */
-	err = ceph_monc_init(&client->monc, client);
-	if (err < 0)
-		goto fail_mempool;
-	err = ceph_osdc_init(&client->osdc, client);
-	if (err < 0)
-		goto fail_monc;
-	err = ceph_mdsc_init(&client->mdsc, client);
-	if (err < 0)
-		goto fail_osdc;
-	return client;
-
-fail_osdc:
-	ceph_osdc_stop(&client->osdc);
-fail_monc:
-	ceph_monc_stop(&client->monc);
-fail_mempool:
-	mempool_destroy(client->wb_pagevec_pool);
 fail_trunc_wq:
-	destroy_workqueue(client->trunc_wq);
+	destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
-	destroy_workqueue(client->pg_inv_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
-	destroy_workqueue(client->wb_wq);
+	destroy_workqueue(fsc->wb_wq);
 fail_bdi:
-	bdi_destroy(&client->backing_dev_info);
+	bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+	ceph_destroy_client(fsc->client);
 fail:
-	kfree(client);
+	kfree(fsc);
 	return ERR_PTR(err);
 }
 
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
 {
-	dout("destroy_client %p\n", client);
+	dout("destroy_fs_client %p\n", fsc);
 
-	/* unmount */
-	ceph_mdsc_stop(&client->mdsc);
-	ceph_osdc_stop(&client->osdc);
+	destroy_workqueue(fsc->wb_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
+	destroy_workqueue(fsc->trunc_wq);
 
-	/*
-	 * make sure mds and osd connections close out before destroying
-	 * the auth module, which is needed to free those connections'
-	 * ceph_authorizers.
-	 */
-	ceph_msgr_flush();
-
-	ceph_monc_stop(&client->monc);
+	bdi_destroy(&fsc->backing_dev_info);
 
-	ceph_debugfs_client_cleanup(client);
-	destroy_workqueue(client->wb_wq);
-	destroy_workqueue(client->pg_inv_wq);
-	destroy_workqueue(client->trunc_wq);
+	mempool_destroy(fsc->wb_pagevec_pool);
 
-	bdi_destroy(&client->backing_dev_info);
+	destroy_mount_options(fsc->mount_options);
 
-	if (client->msgr)
-		ceph_messenger_destroy(client->msgr);
-	mempool_destroy(client->wb_pagevec_pool);
+	ceph_fs_debugfs_cleanup(fsc);
 
-	destroy_mount_args(client->mount_args);
+	ceph_destroy_client(fsc->client);
 
-	kfree(client);
-	dout("destroy_client %p done\n", client);
+	kfree(fsc);
+	dout("destroy_fs_client %p done\n", fsc);
 }
 
 /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
  */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
 {
-	if (client->have_fsid) {
-		if (ceph_fsid_compare(&client->fsid, fsid)) {
-			pr_err("bad fsid, had %pU got %pU",
-			       &client->fsid, fsid);
-			return -1;
-		}
-	} else {
-		pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
-			fsid);
-		memcpy(&client->fsid, fsid, sizeof(*fsid));
-		ceph_debugfs_client_init(client);
-		client->have_fsid = true;
-	}
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
 	return 0;
+
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return -ENOMEM;
 }
 
+static void destroy_caches(void)
+{
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
+}
+
+
 /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
 {
-	return client->monc.monmap && client->monc.monmap->epoch &&
-	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!fsc)
+		return;
+	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
 }
 
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.sync_fs        = ceph_sync_fs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
 /*
  * Bootstrap mount by opening the root directory.  Note the mount
  * @started time from caller, and time out if this takes too long.
  */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 				       const char *path,
 				       unsigned long started)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req = NULL;
 	int err;
 	struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = client->mount_args->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout * HZ;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	if (err == 0) {
 		dout("open_root_inode success\n");
 		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-		    client->sb->s_root == NULL)
+		    fsc->sb->s_root == NULL)
 			root = d_alloc_root(req->r_target_inode);
 		else
 			root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	return root;
 }
 
+
+
+
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
 		      const char *path)
 {
-	struct ceph_entity_addr *myaddr = NULL;
 	int err;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 	unsigned long started = jiffies;  /* note the start time */
 	struct dentry *root;
+	int first = 0;   /* first vfsmount for this super_block */
 
 	dout("mount start\n");
-	mutex_lock(&client->mount_mutex);
-
-	/* initialize the messenger */
-	if (client->msgr == NULL) {
-		if (ceph_test_opt(client, MYIP))
-			myaddr = &client->mount_args->my_addr;
-		client->msgr = ceph_messenger_create(myaddr);
-		if (IS_ERR(client->msgr)) {
-			err = PTR_ERR(client->msgr);
-			client->msgr = NULL;
-			goto out;
-		}
-		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-	}
+	mutex_lock(&fsc->client->mount_mutex);
 
-	/* open session, and wait for mon, mds, and osd maps */
-	err = ceph_monc_open_session(&client->monc);
+	err = __ceph_open_session(fsc->client, started);
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_and_osd_map(client)) {
-		err = -EIO;
-		if (timeout && time_after_eq(jiffies, started + timeout))
-			goto out;
-
-		/* wait */
-		dout("mount waiting for mon_map\n");
-		err = wait_event_interruptible_timeout(client->auth_wq,
-		       have_mon_and_osd_map(client) || (client->auth_err < 0),
-		       timeout);
-		if (err == -EINTR || err == -ERESTARTSYS)
-			goto out;
-		if (client->auth_err < 0) {
-			err = client->auth_err;
-			goto out;
-		}
-	}
-
 	dout("mount opening root\n");
-	root = open_root_dentry(client, "", started);
+	root = open_root_dentry(fsc, "", started);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
 		goto out;
 	}
-	if (client->sb->s_root)
+	if (fsc->sb->s_root) {
 		dput(root);
-	else
-		client->sb->s_root = root;
+	} else {
+		fsc->sb->s_root = root;
+		first = 1;
+
+		err = ceph_fs_debugfs_init(fsc);
+		if (err < 0)
+			goto fail;
+	}
 
 	if (path[0] == 0) {
 		dget(root);
 	} else {
 		dout("mount opening base mountpoint\n");
-		root = open_root_dentry(client, path, started);
+		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
-			dput(client->sb->s_root);
-			client->sb->s_root = NULL;
-			goto out;
+			goto fail;
 		}
 	}
 
 	mnt->mnt_root = root;
-	mnt->mnt_sb = client->sb;
+	mnt->mnt_sb = fsc->sb;
 
-	client->mount_state = CEPH_MOUNT_MOUNTED;
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	dout("mount success\n");
 	err = 0;
 
 out:
-	mutex_unlock(&client->mount_mutex);
+	mutex_unlock(&fsc->client->mount_mutex);
 	return err;
+
+fail:
+	if (first) {
+		dput(fsc->sb->s_root);
+		fsc->sb->s_root = NULL;
+	}
+	goto out;
 }
 
 static int ceph_set_super(struct super_block *s, void *data)
 {
-	struct ceph_client *client = data;
+	struct ceph_fs_client *fsc = data;
 	int ret;
 
 	dout("set_super %p data %p\n", s, data);
 
-	s->s_flags = client->mount_args->sb_flags;
+	s->s_flags = fsc->mount_options->sb_flags;
 	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 
-	s->s_fs_info = client;
-	client->sb = s;
+	s->s_fs_info = fsc;
+	fsc->sb = s;
 
 	s->s_op = &ceph_super_ops;
 	s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
 
 fail:
 	s->s_fs_info = NULL;
-	client->sb = NULL;
+	fsc->sb = NULL;
 	return ret;
 }
 
@@ -926,30 +732,23 @@ fail:
  */
 static int ceph_compare_super(struct super_block *sb, void *data)
 {
-	struct ceph_client *new = data;
-	struct ceph_mount_args *args = new->mount_args;
-	struct ceph_client *other = ceph_sb_to_client(sb);
-	int i;
+	struct ceph_fs_client *new = data;
+	struct ceph_mount_options *fsopt = new->mount_options;
+	struct ceph_options *opt = new->client->options;
+	struct ceph_fs_client *other = ceph_sb_to_client(sb);
 
 	dout("ceph_compare_super %p\n", sb);
-	if (args->flags & CEPH_OPT_FSID) {
-		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-			dout("fsid doesn't match\n");
-			return 0;
-		}
-	} else {
-		/* do we share (a) monitor? */
-		for (i = 0; i < new->monc.monmap->num_mon; i++)
-			if (ceph_monmap_contains(other->monc.monmap,
-					 &new->monc.monmap->mon_inst[i].addr))
-				break;
-		if (i == new->monc.monmap->num_mon) {
-			dout("mon ip not part of monmap\n");
-			return 0;
-		}
-		dout("mon ip matches existing sb %p\n", sb);
+
+	if (compare_mount_options(fsopt, opt, other)) {
+		dout("monitor(s)/mount options don't match\n");
+		return 0;
 	}
-	if (args->sb_flags != other->mount_args->sb_flags) {
+	if ((opt->flags & CEPH_OPT_FSID) &&
+	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+		dout("fsid doesn't match\n");
+		return 0;
+	}
+	if (fsopt->sb_flags != other->mount_options->sb_flags) {
 		dout("flags differ\n");
 		return 0;
 	}
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
  */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+			     struct ceph_fs_client *fsc)
 {
 	int err;
 
 	/* set ra_pages based on rsize mount option? */
-	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
-		client->backing_dev_info.ra_pages =
-			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+		fsc->backing_dev_info.ra_pages =
+			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
 			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
-		sb->s_bdi = &client->backing_dev_info;
+		sb->s_bdi = &fsc->backing_dev_info;
 	return err;
 }
 
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		       struct vfsmount *mnt)
 {
 	struct super_block *sb;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 	const char *path = NULL;
-	struct ceph_mount_args *args;
+	struct ceph_mount_options *fsopt = NULL;
+	struct ceph_options *opt = NULL;
 
 	dout("ceph_get_sb\n");
-	args = parse_mount_args(flags, data, dev_name, &path);
-	if (IS_ERR(args)) {
-		err = PTR_ERR(args);
+	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+	if (err < 0)
 		goto out_final;
-	}
 
 	/* create client (which we may/may not use) */
-	client = ceph_create_client(args);
-	if (IS_ERR(client)) {
-		err = PTR_ERR(client);
+	fsc = create_fs_client(fsopt, opt);
+	if (IS_ERR(fsc)) {
+		err = PTR_ERR(fsc);
+		kfree(fsopt);
+		kfree(opt);
 		goto out_final;
 	}
 
-	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+	err = ceph_mdsc_init(fsc);
+	if (err < 0)
+		goto out;
+
+	if (ceph_test_opt(fsc->client, NOSHARE))
 		compare_super = NULL;
-	sb = sget(fs_type, compare_super, ceph_set_super, client);
+	sb = sget(fs_type, compare_super, ceph_set_super, fsc);
 	if (IS_ERR(sb)) {
 		err = PTR_ERR(sb);
 		goto out;
 	}
 
-	if (ceph_sb_to_client(sb) != client) {
-		ceph_destroy_client(client);
-		client = ceph_sb_to_client(sb);
-		dout("get_sb got existing client %p\n", client);
+	if (ceph_sb_to_client(sb) != fsc) {
+		ceph_mdsc_destroy(fsc);
+		destroy_fs_client(fsc);
+		fsc = ceph_sb_to_client(sb);
+		dout("get_sb got existing client %p\n", fsc);
 	} else {
-		dout("get_sb using new client %p\n", client);
-		err = ceph_register_bdi(sb, client);
+		dout("get_sb using new client %p\n", fsc);
+		err = ceph_register_bdi(sb, fsc);
 		if (err < 0)
 			goto out_splat;
 	}
 
-	err = ceph_mount(client, mnt, path);
+	err = ceph_mount(fsc, mnt, path);
 	if (err < 0)
 		goto out_splat;
 	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	return 0;
 
 out_splat:
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 	deactivate_locked_super(sb);
 	goto out_final;
 
 out:
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 out_final:
 	dout("ceph_get_sb fail %d\n", err);
 	return err;
@@ -1042,11 +849,12 @@ out_final:
 
 static void ceph_kill_sb(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 	dout("kill_sb %p\n", s);
-	ceph_mdsc_pre_umount(&client->mdsc);
+	ceph_mdsc_pre_umount(fsc->mdsc);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 }
 
 static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
 
 static int __init init_ceph(void)
 {
-	int ret = 0;
-
-	ret = ceph_debugfs_init();
-	if (ret < 0)
-		goto out;
-
-	ret = ceph_msgr_init();
-	if (ret < 0)
-		goto out_debugfs;
-
-	ret = init_caches();
+	int ret = init_caches();
 	if (ret)
-		goto out_msgr;
+		goto out;
 
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
 
-	pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
-		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
 	return 0;
 
 out_icache:
 	destroy_caches();
-out_msgr:
-	ceph_msgr_exit();
-out_debugfs:
-	ceph_debugfs_cleanup();
 out:
 	return ret;
 }
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
 	destroy_caches();
-	ceph_msgr_exit();
-	ceph_debugfs_cleanup();
 }
 
 module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..e2e904442ce2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
 #ifndef _FS_CEPH_SUPER_H
 #define _FS_CEPH_SUPER_H
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <asm/unaligned.h>
 #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
 #include <linux/writeback.h>
 #include <linux/slab.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
 
 /* f_type in struct statfs */
 #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
 #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED  CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
 
-/*
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+	(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define ceph_set_opt(client, opt) \
-	(client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
-	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
 
-
-struct ceph_mount_args {
-	int sb_flags;
+struct ceph_mount_options {
 	int flags;
-	struct ceph_fsid fsid;
-	struct ceph_entity_addr my_addr;
-	int num_mon;
-	struct ceph_entity_addr *mon_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_timeout;
-	int osd_keepalive_timeout;
+	int sb_flags;
+
 	int wsize;
 	int rsize;            /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
 	int cap_release_safety;
 	int max_readdir;       /* max readdir result (entires) */
 	int max_readdir_bytes; /* max readdir result (bytes) */
-	char *snapdir_name;   /* default ".snap" */
-	char *name;
-	char *secret;
-};
-
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
-	CEPH_MOUNT_MOUNTING,
-	CEPH_MOUNT_MOUNTED,
-	CEPH_MOUNT_UNMOUNTING,
-	CEPH_MOUNT_UNMOUNTED,
-	CEPH_MOUNT_SHUTDOWN,
-};
 
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	BUG_ON(time_after(b, a));
-	return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-	struct ceph_fsid fsid;
-	bool have_fsid;
+	/*
+	 * everything above this point can be memcmp'd; everything below
+	 * is handled in compare_mount_options()
+	 */
 
-	struct mutex mount_mutex;       /* serialize mount attempts */
-	struct ceph_mount_args *mount_args;
+	char *snapdir_name;   /* default ".snap" */
+};
 
+struct ceph_fs_client {
 	struct super_block *sb;
 
-	unsigned long mount_state;
-	wait_queue_head_t auth_wq;
-
-	int auth_err;
+	struct ceph_mount_options *mount_options;
+	struct ceph_client *client;
 
+	unsigned long mount_state;
 	int min_caps;                  /* min caps i added */
 
-	struct ceph_messenger *msgr;   /* messenger instance */
-	struct ceph_mon_client monc;
-	struct ceph_mds_client mdsc;
-	struct ceph_osd_client osdc;
+	struct ceph_mds_client *mdsc;
 
 	/* writeback */
 	mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
 	struct backing_dev_info backing_dev_info;
 
 #ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_monmap;
-	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_dentry_lru, *debugfs_caps;
 	struct dentry *debugfs_congestion_kb;
 	struct dentry *debugfs_bdi;
+	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
 #endif
 };
 
+
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
 	int should_free_val;
 };
 
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
 struct ceph_inode_xattrs_info {
 	/*
 	 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
 /*
  * Ceph inode.
  */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
-
 struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
 
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+	if (!ino)
+		ino = 1;
+#endif
+	return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
 static inline void ceph_i_clear(struct inode *inode, unsigned mask)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -432,20 +418,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 			    struct ceph_inode_frag *pfrag,
 			    int *found);
 
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-	struct ceph_mds_session *lease_session;
-	u32 lease_gen, lease_shared_gen;
-	u32 lease_seq;
-	unsigned long lease_renew_after, lease_renew_from;
-	struct list_head lru;
-	struct dentry *dentry;
-	u64 time;
-	u64 offset;
-};
-
 static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 {
 	return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +428,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 	return ((loff_t)frag << 32) | (loff_t)off;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-	if (!ino)
-		ino = 1;
-#endif
-	return ino;
-}
-
 static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 {
 	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +435,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 	return 0;
 }
 
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-	struct ceph_vino *pvino = (struct ceph_vino *)data;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	return ci->i_vino.ino == pvino->ino &&
-		ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-					    struct ceph_vino vino)
-{
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
 /*
  * caps helpers
  */
@@ -576,18 +499,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 			       struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
 				    int *total, int *avail, int *used,
 				    int *reserved, int *min);
 
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
 {
-	return (struct ceph_client *)inode->i_sb->s_fs_info;
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
 }
 
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
 {
-	return (struct ceph_client *)sb->s_fs_info;
+	return (struct ceph_fs_client *)sb->s_fs_info;
 }
 
 
@@ -616,51 +539,6 @@ struct ceph_file_info {
 
 
-/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-	atomic_t nref;
-	u64 seq;
-	int num_snaps;
-	u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-	/*
-	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)+1);
-	*/
-	if (sc)
-		atomic_inc(&sc->nref);
-	return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-	if (!sc)
-		return;
-	/*
-	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)-1);
-	*/
-	if (atomic_dec_and_test(&sc->nref)) {
-		/*printk(" deleting snap_context %p\n", sc);*/
-		kfree(sc);
-	}
-}
-
 /*
  * A "snap realm" describes a subset of the file hierarchy sharing
  * the same set of snapshots that apply to it.  The realms themselves
@@ -699,16 +577,33 @@ struct ceph_snap_realm {
 	spinlock_t inodes_with_caps_lock;
 };
 
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
 {
-	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-		(off >> PAGE_CACHE_SHIFT);
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
 }
 
 
@@ -741,16 +636,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 			   ci_item)->writing;
 }
 
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
 /* inode.c */
 extern const struct inode_operations ceph_file_iops;
 
@@ -857,12 +742,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				       struct nameidata *nd, int mode,
 				       int locked_dir);
 extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
 
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
@@ -892,12 +783,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 /* export.c */
 extern const struct export_operations ceph_export_ops;
 
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
 /* locks.c */
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +799,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
 	return NULL;
 }
 
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-#include "ceph_hash.h"
-
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
-	u64 ino;
-	u64 snap;
-};
-
-
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
-	int count;
-};
-
-
-#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..70e919960acb 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 #include <linux/xattr.h>
 #include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			      const char *value, size_t size, int flags)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	int err;
 	int i, nr_pages;
 	struct page **pages = NULL;
@@ -777,8 +780,8 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
new file mode 100644
index 000000000000..7fff521d7eb5
--- /dev/null
+++ b/include/linux/ceph/auth.h
@@ -0,0 +1,92 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys.  These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+	const char *name;
+
+	/*
+	 * true if we are authenticated and can connect to
+	 * services.
+	 */
+	int (*is_authenticated)(struct ceph_auth_client *ac);
+
+	/*
+	 * true if we should (re)authenticate, e.g., when our tickets
+	 * are getting old and crusty.
+	 */
+	int (*should_authenticate)(struct ceph_auth_client *ac);
+
+	/*
+	 * build requests and process replies during monitor
+	 * handshake.  if handle_reply returns -EAGAIN, we build
+	 * another request.
+	 */
+	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+	int (*handle_reply)(struct ceph_auth_client *ac, int result,
+			    void *buf, void *end);
+
+	/*
+	 * Create authorizer for connecting to a service, and verify
+	 * the response to authenticate the service.
+	 */
+	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+				 struct ceph_authorizer **a,
+				 void **buf, size_t *len,
+				 void **reply_buf, size_t *reply_len);
+	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+				       struct ceph_authorizer *a, size_t len);
+	void (*destroy_authorizer)(struct ceph_auth_client *ac,
+				   struct ceph_authorizer *a);
+	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+				      int peer_type);
+
+	/* reset when we (re)connect to a monitor */
+	void (*reset)(struct ceph_auth_client *ac);
+
+	void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+	u32 protocol;           /* CEPH_AUTH_* */
+	void *private;          /* for use by protocol implementation */
+	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
+
+	bool negotiating;       /* true if negotiating protocol */
+	const char *name;       /* entity name */
+	u64 global_id;          /* our unique id in system */
+	const char *secret;     /* our secret key */
+	unsigned want_keys;     /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+					       const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+				 void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+				  void *buf, size_t len,
+				  void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+#endif
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/include/linux/ceph/buffer.h
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+	struct kref kref;
+	struct kvec vec;
+	size_t alloc_len;
+	bool is_vmalloc;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+	kref_get(&b->kref);
+	return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+	kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
new file mode 100644
index 000000000000..aa2e19182d99
--- /dev/null
+++ b/include/linux/ceph/ceph_debug.h
@@ -0,0 +1,38 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+#  define dout(fmt, ...)						\
+	pr_debug("%.*s %12.12s:%-4d : " fmt,				\
+		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
+		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
+		 __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+#  define dout(fmt, ...)	do {				\
+		if (0)						\
+			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+	} while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
new file mode 100644
index 000000000000..5babb8e95352
--- /dev/null
+++ b/include/linux/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+	return (b << 24) |
+		(v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+	return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+	return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+	return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+	/* is sub as specific as us, and contained by us? */
+	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f) - 1,
+			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1,
+	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+	int newbits = ceph_frag_bits(f) + by;
+	return ceph_frag_make(newbits,
+			 ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+	return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+	return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
new file mode 100644
index 000000000000..d5619ac86711
--- /dev/null
+++ b/include/linux/ceph/ceph_fs.h
@@ -0,0 +1,728 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * subprotocol versions.  when specific messages types or high-level
+ * protocols change, bump the affected components.  we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
+#define CEPH_MON_PROTOCOL     5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
+
+
+#define CEPH_INO_ROOT  1
+#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID            (1<<0)
+#define CEPH_FEATURE_NOSRCADDR      (1<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
+#define CEPH_FEATURE_FLOCK          (1<<3)
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+	/* file -> object mapping */
+	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+				      of page size. */
+	__le32 fl_stripe_count;    /* over this many objects */
+	__le32 fl_object_size;     /* until objects are this big, then move to
+				      new objects */
+	__le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+	/* pg -> disk layout */
+	__le32 fl_object_stripe_unit;  /* for per-object parity, if any */
+
+	/* object -> pg layout */
+	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN	0x0
+#define CEPH_AUTH_NONE	 	0x1
+#define CEPH_AUTH_CEPHX	 	0x2
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH			17
+#define CEPH_MSG_AUTH_REPLY		18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP          41
+#define CEPH_MSG_OSD_OP           42
+#define CEPH_MSG_OSD_OPREPLY      43
+
+/* pool operations */
+enum {
+  POOL_OP_CREATE			= 0x01,
+  POOL_OP_DELETE			= 0x02,
+  POOL_OP_AUID_CHANGE			= 0x03,
+  POOL_OP_CREATE_SNAP			= 0x11,
+  POOL_OP_DELETE_SNAP			= 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
+};
+
+struct ceph_mon_request_header {
+	__le64 have_version;
+	__le16 session_mon;
+	__le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+	__le64 kb, kb_used, kb_avail;
+	__le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+	struct ceph_fsid fsid;
+	__le64 version;
+	struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+	struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+	__le64 have_version;	__le64 have;
+	__u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+	__le32 duration;         /* seconds */
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+					  empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+					  operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_ISNAP       16
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+	CEPH_SESSION_REQUEST_OPEN,
+	CEPH_SESSION_OPEN,
+	CEPH_SESSION_REQUEST_CLOSE,
+	CEPH_SESSION_CLOSE,
+	CEPH_SESSION_REQUEST_RENEWCAPS,
+	CEPH_SESSION_RENEWCAPS,
+	CEPH_SESSION_STALE,
+	CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+	__le32 op;
+	__le64 seq;
+	struct ceph_timespec stamp;
+	__le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ &  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+	CEPH_MDS_OP_LOOKUP     = 0x00100,
+	CEPH_MDS_OP_GETATTR    = 0x00101,
+	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+	CEPH_MDS_OP_SETXATTR   = 0x01105,
+	CEPH_MDS_OP_RMXATTR    = 0x01106,
+	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+	CEPH_MDS_OP_SETATTR    = 0x01108,
+	CEPH_MDS_OP_SETFILELOCK= 0x01109,
+	CEPH_MDS_OP_GETFILELOCK= 0x00110,
+
+	CEPH_MDS_OP_MKNOD      = 0x01201,
+	CEPH_MDS_OP_LINK       = 0x01202,
+	CEPH_MDS_OP_UNLINK     = 0x01203,
+	CEPH_MDS_OP_RENAME     = 0x01204,
+	CEPH_MDS_OP_MKDIR      = 0x01220,
+	CEPH_MDS_OP_RMDIR      = 0x01221,
+	CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+	CEPH_MDS_OP_CREATE     = 0x01301,
+	CEPH_MDS_OP_OPEN       = 0x00302,
+	CEPH_MDS_OP_READDIR    = 0x00305,
+
+	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+	CEPH_MDS_OP_MKSNAP     = 0x01400,
+	CEPH_MDS_OP_RMSNAP     = 0x01401,
+	CEPH_MDS_OP_LSSNAP     = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE   1
+#define CEPH_SETATTR_UID    2
+#define CEPH_SETATTR_GID    4
+#define CEPH_SETATTR_MTIME  8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE  32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+	} __attribute__ ((packed)) getattr;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+	} __attribute__ ((packed)) setattr;
+	struct {
+		__le32 frag;                 /* which dir fragment */
+		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
+	} __attribute__ ((packed)) readdir;
+	struct {
+		__le32 mode;
+		__le32 rdev;
+	} __attribute__ ((packed)) mknod;
+	struct {
+		__le32 mode;
+	} __attribute__ ((packed)) mkdir;
+	struct {
+		__le32 flags;
+		__le32 mode;
+		__le32 stripe_unit;          /* layout for newly created file */
+		__le32 stripe_count;         /* ... */
+		__le32 object_size;
+		__le32 file_replication;
+		__le32 preferred;
+	} __attribute__ ((packed)) open;
+	struct {
+		__le32 flags;
+	} __attribute__ ((packed)) setxattr;
+	struct {
+		struct ceph_file_layout layout;
+	} __attribute__ ((packed)) setlayout;
+	struct {
+		__u8 rule; /* currently fcntl or flock */
+		__u8 type; /* shared, exclusive, remove*/
+		__le64 pid; /* process id requesting the lock */
+		__le64 pid_namespace;
+		__le64 start; /* initial location to lock */
+		__le64 length; /* num bytes to lock from start */
+		__u8 wait; /* will caller wait for lock to become available? */
+	} __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head {
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+	__le64 ino, cap_id;            /* ino and unique cap id */
+	__le32 caps, wanted;           /* new issued, wanted */
+	__le32 seq, issue_seq, mseq;
+	__le32 dname_seq;              /* if releasing a dentry lease, a */
+	__le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+	__le32 op;
+	__le32 result;
+	__le32 mdsmap_epoch;
+	__u8 safe;                     /* true if committed to disk */
+	__u8 is_dentry, is_target;     /* true if dentry, target inode records
+					  are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+	__le32 frag;                   /* this frag splits... */
+	__le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+	__le32 nsplits;                /* num ceph_frag_tree_split records */
+	struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+	__le32 caps, wanted;           /* caps issued, wanted */
+	__le64 cap_id;
+	__le32 seq, mseq;
+	__le64 realm;                  /* snap realm */
+	__u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+	__le64 ino;
+	__le64 snapid;
+	__le32 rdev;
+	__le64 version;                /* inode version */
+	__le64 xattr_version;          /* version for xattr blob */
+	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+	struct ceph_file_layout layout;
+	struct ceph_timespec ctime, mtime, atime;
+	__le32 time_warp_seq;
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	__le32 mode, uid, gid;
+	__le32 nlink;
+	__le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
+	struct ceph_timespec rctime;
+	struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+	__le16 mask;            /* lease type(s) */
+	__le32 duration_ms;     /* lease duration */
+	__le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+	__le32 frag;            /* fragment */
+	__le32 auth;            /* auth mds, if this is a delegation point */
+	__le32 ndist;           /* number of mds' this is replicated on */
+	__le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL    1
+#define CEPH_LOCK_FLOCK    2
+
+#define CEPH_LOCK_SHARED   1
+#define CEPH_LOCK_EXCL     2
+#define CEPH_LOCK_UNLOCK   4
+
+struct ceph_filelock {
+	__le64 start;/* file offset to start lock at */
+	__le64 length; /* num bytes to lock; 0 for all following start */
+	__le64 client; /* which client holds the lock */
+	__le64 pid; /* process id holding the lock on the client */
+	__le64 pid_namespace;
+	__u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED     1  /* client can reads */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8
+#define CEPH_CAP_SFLOCK    20 
+
+#define CEPH_CAP_BITS       22
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
+#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
+
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
+				 CEPH_CAP_AUTH_SHARED |	\
+				 CEPH_CAP_LINK_SHARED |	\
+				 CEPH_CAP_FILE_SHARED |	\
+				 CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
+			      CEPH_CAP_LINK_SHARED |			\
+			      CEPH_CAP_XATTR_SHARED |			\
+			      CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
+			   CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
+			   CEPH_CAP_LINK_EXCL |		\
+			   CEPH_CAP_XATTR_EXCL |	\
+			   CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
+			      CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+			   CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+			CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+	CEPH_CAP_OP_GRANT,         /* mds->client grant */
+	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+	CEPH_CAP_OP_UPDATE,        /* client->mds update */
+	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+	__le32 op;                  /* CEPH_CAP_OP_* */
+	__le64 ino, realm;
+	__le64 cap_id;
+	__le32 seq, issue_seq;
+	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+	__le32 migrate_seq;
+	__le64 snap_follows;
+	__le32 snap_trace_len;
+
+	/* authlock */
+	__le32 uid, gid, mode;
+
+	/* linklock */
+	__le32 nlink;
+
+	/* xattrlock */
+	__le32 xattr_len;
+	__le64 xattr_version;
+
+	/* filelock */
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	struct ceph_timespec mtime, atime, ctime;
+	struct ceph_file_layout layout;
+	__le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+	__le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+	__le64 ino;
+	__le64 cap_id;
+	__le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+	__u8 action;            /* CEPH_MDS_LEASE_* */
+	__le16 mask;            /* which lease */
+	__le64 ino;
+	__le64 first, last;     /* snap range */
+	__le32 seq;
+	__le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+	__le32 flock_len;       /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 size;
+	struct ceph_timespec mtime, atime;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+	__le64 ino;     /* snap realm base */
+	__le64 seq;     /* snap seq for this snap realm */
+	__le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+	CEPH_SNAP_OP_CREATE,
+	CEPH_SNAP_OP_DESTROY,
+	CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+	__le32 op;                /* CEPH_SNAP_OP_* */
+	__le64 split;             /* ino to split off, if any */
+	__le32 num_split_inos;    /* # inos belonging to new child realm */
+	__le32 num_split_realms;  /* # child realms udner new child realm */
+	__le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+	__le64 ino;           /* ino */
+	__le64 created;       /* snap: when created */
+	__le64 parent;        /* ino: parent realm */
+	__le64 parent_since;  /* snap: same parent since */
+	__le64 seq;           /* snap: version */
+	__le32 num_snaps;
+	__le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
new file mode 100644
index 000000000000..d099c3f90236
--- /dev/null
+++ b/include/linux/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000000..2a79702e092b
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include "ceph_debug.h"
+#include "types.h"
+
+#define CEPH_DEFINE_SHOW_FUNC(name)					\
+static int name##_open(struct inode *inode, struct file *file)		\
+{									\
+	struct seq_file *sf;						\
+	int ret;							\
+									\
+	ret = single_open(file, name, NULL);				\
+	sf = file->private_data;					\
+	sf->private = inode->i_private;					\
+	return ret;							\
+}									\
+									\
+static const struct file_operations name##_fops = {			\
+	.open		= name##_open,					\
+	.read		= seq_read,					\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+};
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
new file mode 100644
index 000000000000..c5b6939fb32a
--- /dev/null
+++ b/include/linux/ceph/decode.h
@@ -0,0 +1,201 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+#include <linux/time.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ *   void **p     pointer to position pointer
+ *   void *end    pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+	u64 v = get_unaligned_le64(*p);
+	*p += sizeof(u64);
+	return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+	u32 v = get_unaligned_le32(*p);
+	*p += sizeof(u32);
+	return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+	u16 v = get_unaligned_le16(*p);
+	*p += sizeof(u16);
+	return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+	u8 v = *(u8 *)*p;
+	(*p)++;
+	return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+	memcpy(pv, *p, n);
+	*p += n;
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u64), bad);	\
+		v = ceph_decode_64(p);				\
+	} while (0)
+#define ceph_decode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u32), bad);	\
+		v = ceph_decode_32(p);				\
+	} while (0)
+#define ceph_decode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u16), bad);	\
+		v = ceph_decode_16(p);				\
+	} while (0)
+#define ceph_decode_8_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u8), bad);	\
+		v = ceph_decode_8(p);				\
+	} while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_decode_need(p, end, n, bad);		\
+		ceph_decode_copy(p, pv, n);			\
+	} while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+					const struct ceph_timespec *tv)
+{
+	ts->tv_sec = le32_to_cpu(tv->tv_sec);
+	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+					const struct timespec *ts)
+{
+	tv->tv_sec = cpu_to_le32(ts->tv_sec);
+	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+	__be16 ss_family = htons(a->in_addr.ss_family);
+	a->in_addr.ss_family = *(__u16 *)&ss_family;
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
+	a->in_addr.ss_family = ntohs(ss_family);
+	WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+	put_unaligned_le64(v, (__le64 *)*p);
+	*p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+	put_unaligned_le32(v, (__le32 *)*p);
+	*p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+	put_unaligned_le16(v, (__le16 *)*p);
+	*p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+	*(u8 *)*p = v;
+	(*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+	memcpy(*p, s, len);
+	*p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+					u64 ino, const char *path)
+{
+	u32 len = path ? strlen(path) : 0;
+	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, ino);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, path, len);
+	*p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+				      const char *s, u32 len)
+{
+	BUG_ON(*p + sizeof(len) + len > end);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, s, len);
+	*p += len;
+}
+
+#define ceph_encode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u64), bad);	\
+		ceph_encode_64(p, v);				\
+	} while (0)
+#define ceph_encode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u32), bad);	\
+		ceph_encode_32(p, v);			\
+	} while (0)
+#define ceph_encode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u16), bad);	\
+		ceph_encode_16(p, v);			\
+	} while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_copy(p, pv, n);			\
+	} while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_string(p, end, s, n);		\
+	} while (0)
+
+
+#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 000000000000..f22b2e941686
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/*
+ * Supported features
+ */
+#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes */
+
+#define CEPH_OPT_DEFAULT   (0);
+
+#define ceph_set_opt(client, opt) \
+	(client)->options->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+	(!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+	int flags;
+	struct ceph_fsid fsid;
+	struct ceph_entity_addr my_addr;
+	int mount_timeout;
+	int osd_idle_ttl;
+	int osd_timeout;
+	int osd_keepalive_timeout;
+
+	/*
+	 * any type that can't be simply compared or doesn't need need
+	 * to be compared should go beyond this point,
+	 * ceph_compare_options() should be updated accordingly
+	 */
+
+	struct ceph_entity_addr *mon_addr; /* should be the first
+					      pointer type of args */
+	int num_mon;
+	char *name;
+	char *secret;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+	CEPH_MOUNT_MOUNTING,
+	CEPH_MOUNT_MOUNTED,
+	CEPH_MOUNT_UNMOUNTING,
+	CEPH_MOUNT_UNMOUNTED,
+	CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+	BUG_ON(time_after(b, a));
+	return (long)a - (long)b;
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+	struct ceph_fsid fsid;
+	bool have_fsid;
+
+	void *private;
+
+	struct ceph_options *options;
+
+	struct mutex mount_mutex;      /* serialize mount attempts */
+	wait_queue_head_t auth_wq;
+	int auth_err;
+
+	int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+	u32 supported_features;
+	u32 required_features;
+
+	struct ceph_messenger *msgr;   /* messenger instance */
+	struct ceph_mon_client monc;
+	struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_monmap;
+	struct dentry *debugfs_osdmap;
+#endif
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+	atomic_t nref;
+	u64 seq;
+	int num_snaps;
+	u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+	/*
+	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)+1);
+	*/
+	if (sc)
+		atomic_inc(&sc->nref);
+	return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+	if (!sc)
+		return;
+	/*
+	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)-1);
+	*/
+	if (atomic_dec_and_test(&sc->nref)) {
+		/*printk(" deleting snap_context %p\n", sc);*/
+		kfree(sc);
+	}
+}
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+		(off >> PAGE_CACHE_SHIFT);
+}
+
+/* ceph_common.c */
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern int ceph_parse_options(struct ceph_options **popt, char *options,
+			      const char *dev_name, const char *dev_name_end,
+			      int (*parse_extra_token)(char *c, void *private),
+			      void *private);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+				struct ceph_client *client);
+extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
+					      void *private);
+extern u64 ceph_client_id(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+			       unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+extern struct page **ceph_get_direct_page_vector(const char __user *data,
+					    int num_pages,
+					    loff_t off, size_t len);
+extern void ceph_put_page_vector(struct page **pages, int num_pages);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len);
+extern int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+				    loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
new file mode 100644
index 000000000000..4c5cb0880bba
--- /dev/null
+++ b/include/linux/ceph/mdsmap.h
@@ -0,0 +1,62 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+	u64 global_id;
+	struct ceph_entity_addr addr;
+	s32 state;
+	int num_export_targets;
+	bool laggy;
+	u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+	u32 m_epoch, m_client_epoch, m_last_failure;
+	u32 m_root;
+	u32 m_session_timeout;          /* seconds */
+	u32 m_session_autoclose;        /* seconds */
+	u64 m_max_file_size;
+	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
+	struct ceph_mds_info *m_info;
+
+	/* which object pools file data can be stored in */
+	int m_num_data_pg_pools;
+	u32 *m_data_pg_pools;
+	u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+	if (w >= m->m_max_mds)
+		return NULL;
+	return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+	BUG_ON(w < 0);
+	if (w >= m->m_max_mds)
+		return CEPH_MDS_STATE_DNE;
+	return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+	if (w >= 0 && w < m->m_max_mds)
+		return m->m_info[w].laggy;
+	return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
new file mode 100644
index 000000000000..5956d62c3057
--- /dev/null
+++ b/include/linux/ceph/messenger.h
@@ -0,0 +1,261 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+	struct ceph_connection *(*get)(struct ceph_connection *);
+	void (*put)(struct ceph_connection *);
+
+	/* handle an incoming message. */
+	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+	/* authorize an outgoing connection */
+	int (*get_authorizer) (struct ceph_connection *con,
+			       void **buf, int *len, int *proto,
+			       void **reply_buf, int *reply_len, int force_new);
+	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+	int (*invalidate_authorizer)(struct ceph_connection *con);
+
+	/* protocol version mismatch */
+	void (*bad_proto) (struct ceph_connection *con);
+
+	/* there was some error on the socket (disconnect, whatever) */
+	void (*fault) (struct ceph_connection *con);
+
+	/* a remote host as terminated a message exchange session, and messages
+	 * we sent (or they tried to send us) may be lost. */
+	void (*peer_reset) (struct ceph_connection *con);
+
+	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+					struct ceph_msg_header *hdr,
+					int *skip);
+};
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+	struct ceph_entity_inst inst;    /* my name+address */
+	struct ceph_entity_addr my_enc_addr;
+	struct page *zero_page;          /* used in certain error cases */
+
+	bool nocrc;
+
+	/*
+	 * the global_seq counts connections i (attempt to) initiate
+	 * in order to disambiguate certain connect race conditions.
+	 */
+	u32 global_seq;
+	spinlock_t global_seq_lock;
+
+	u32 supported_features;
+	u32 required_features;
+};
+
+/*
+ * a single message.  it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+	struct ceph_msg_header hdr;	/* header */
+	struct ceph_msg_footer footer;	/* footer */
+	struct kvec front;              /* unaligned blobs of message */
+	struct ceph_buffer *middle;
+	struct page **pages;            /* data payload.  NOT OWNER. */
+	unsigned nr_pages;              /* size of page array */
+	struct ceph_pagelist *pagelist; /* instead of pages */
+	struct list_head list_head;
+	struct kref kref;
+	struct bio  *bio;		/* instead of pages/pagelist */
+	struct bio  *bio_iter;		/* bio iterator */
+	int bio_seg;			/* current bio segment */
+	struct ceph_pagelist *trail;	/* the trailing part of the data */
+	bool front_is_vmalloc;
+	bool more_to_follow;
+	bool needs_out_seq;
+	int front_max;
+
+	struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+	int page, page_pos;  /* which page; offset in page */
+	int data_pos;        /* offset in data payload */
+	int did_page_crc;    /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL	(HZ/2)
+#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX         0  /* we can close channel or drop messages on errors */
+#define CONNECTING	1
+#define NEGOTIATING	2
+#define KEEPALIVE_PENDING      3
+#define WRITE_PENDING	4  /* we have data ready to send */
+#define QUEUED          5  /* there is work queued on this connection */
+#define BUSY            6  /* work is being done */
+#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
+			    * the ceph_connection around to maintain shared
+			    * state with the peer. */
+#define CLOSED		10 /* we've closed the connection */
+#define SOCK_CLOSED	11 /* socket state changed to closed */
+#define OPENING         13 /* open connection w/ (possibly new) peer */
+#define DEAD            14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+	void *private;
+	atomic_t nref;
+
+	const struct ceph_connection_operations *ops;
+
+	struct ceph_messenger *msgr;
+	struct socket *sock;
+	unsigned long state;	/* connection state (see flags above) */
+	const char *error_msg;  /* error message, if any */
+
+	struct ceph_entity_addr peer_addr; /* peer address */
+	struct ceph_entity_name peer_name; /* peer name */
+	struct ceph_entity_addr peer_addr_for_me;
+	unsigned peer_features;
+	u32 connect_seq;      /* identify the most recent connection
+				 attempt for this connection, client */
+	u32 peer_global_seq;  /* peer's global seq for this connection */
+
+	int auth_retry;       /* true if we need a newer authorizer */
+	void *auth_reply_buf;   /* where to put the authorizer reply */
+	int auth_reply_buf_len;
+
+	struct mutex mutex;
+
+	/* out queue */
+	struct list_head out_queue;
+	struct list_head out_sent;   /* sending or sent but unacked */
+	u64 out_seq;		     /* last message queued for send */
+	bool out_keepalive_pending;
+
+	u64 in_seq, in_seq_acked;  /* last message received, acked */
+
+	/* connection negotiation temps */
+	char in_banner[CEPH_BANNER_MAX_LEN];
+	union {
+		struct {  /* outgoing connection */
+			struct ceph_msg_connect out_connect;
+			struct ceph_msg_connect_reply in_reply;
+		};
+		struct {  /* incoming */
+			struct ceph_msg_connect in_connect;
+			struct ceph_msg_connect_reply out_reply;
+		};
+	};
+	struct ceph_entity_addr actual_peer_addr;
+
+	/* message out temps */
+	struct ceph_msg *out_msg;        /* sending message (== tail of
+					    out_sent) */
+	bool out_msg_done;
+	struct ceph_msg_pos out_msg_pos;
+
+	struct kvec out_kvec[8],         /* sending header/footer data */
+		*out_kvec_cur;
+	int out_kvec_left;   /* kvec's left in out_kvec */
+	int out_skip;        /* skip this many bytes */
+	int out_kvec_bytes;  /* total bytes left */
+	bool out_kvec_is_msg; /* kvec refers to out_msg */
+	int out_more;        /* there is more data after the kvecs */
+	__le64 out_temp_ack; /* for writing an ack */
+
+	/* message in temps */
+	struct ceph_msg_header in_hdr;
+	struct ceph_msg *in_msg;
+	struct ceph_msg_pos in_msg_pos;
+	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
+
+	char in_tag;         /* protocol control byte */
+	int in_base_pos;     /* bytes read */
+	__le64 in_temp_ack;  /* for reading an ack */
+
+	struct delayed_work work;	    /* send|recv work */
+	unsigned long       delay;          /* current delay interval */
+};
+
+
+extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+			  struct ceph_entity_addr *addr,
+			  int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+	struct ceph_entity_addr *myaddr,
+	u32 features, u32 required);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+			  struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+			  struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+				  struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+	kref_get(&msg->kref);
+	return msg;
+}
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+	kref_put(&msg->kref, ceph_msg_last_put);
+}
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
new file mode 100644
index 000000000000..545f85917780
--- /dev/null
+++ b/include/linux/ceph/mon_client.h
@@ -0,0 +1,122 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+
+#include "messenger.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 num_mon;
+	struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_generic_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+					 int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+	struct ceph_mon_client *monc;
+	struct delayed_work delayed_work;
+	unsigned long delay;
+	ceph_monc_request_func_t do_request;
+};
+
+/*
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are bening done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_generic_request {
+	struct kref kref;
+	u64 tid;
+	struct rb_node node;
+	int result;
+	void *buf;
+	int buf_len;
+	struct completion completion;
+	struct ceph_msg *request;  /* original request */
+	struct ceph_msg *reply;    /* and reply */
+};
+
+struct ceph_mon_client {
+	struct ceph_client *client;
+	struct ceph_monmap *monmap;
+
+	struct mutex mutex;
+	struct delayed_work delayed_work;
+
+	struct ceph_auth_client *auth;
+	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
+	int pending_auth;
+
+	bool hunting;
+	int cur_mon;                       /* last monitor i contacted */
+	unsigned long sub_sent, sub_renew_after;
+	struct ceph_connection *con;
+	bool have_fsid;
+
+	/* pending generic requests */
+	struct rb_root generic_request_tree;
+	int num_generic_requests;
+	u64 last_tid;
+
+	/* mds/osd map */
+	int want_mdsmap;
+	int want_next_osdmap; /* 1 = want, 2 = want+asked */
+	u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+				struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map.  We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+			       struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+				   u32 pool, u64 *snapid);
+
+extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+				   u32 pool, u64 snapid);
+
+#endif
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
new file mode 100644
index 000000000000..a362605f9368
--- /dev/null
+++ b/include/linux/ceph/msgpool.h
@@ -0,0 +1,25 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include <linux/mempool.h>
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+	const char *name;
+	mempool_t *pool;
+	int front_len;          /* preallocated payload size */
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+			     int front_len, int size, bool blocking,
+			     const char *name);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+					 int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
new file mode 100644
index 000000000000..680d3d648cac
--- /dev/null
+++ b/include/linux/ceph/msgr.h
@@ -0,0 +1,175 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT    6789  /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST  6789
+#define CEPH_PORT_START  6800  /* non-monitors start here */
+#define CEPH_PORT_LAST   6900
+
+/*
+ * tcp connection banner.  include a protocol version. and adjust
+ * whenever the wire protocol changes.  try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+       return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+	__u8 type;      /* CEPH_ENTITY_TYPE_* */
+	__le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON    0x01
+#define CEPH_ENTITY_TYPE_MDS    0x02
+#define CEPH_ENTITY_TYPE_OSD    0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_AUTH   0x20
+
+#define CEPH_ENTITY_TYPE_ANY    0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+	__le32 type;
+	__le32 nonce;  /* unique id for process (e.g. pid) */
+	struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+	struct ceph_entity_name name;
+	struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
+					  incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
+					  with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
+					  with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
+#define CEPH_MSGR_TAG_MSG           7  /* message */
+#define CEPH_MSGR_TAG_ACK           8  /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+	__le64 features;     /* supported feature bits */
+	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
+	__le32 global_seq;   /* count connections initiated by this host */
+	__le32 connect_seq;  /* count connections initiated in this session */
+	__le32 protocol_version;
+	__le32 authorizer_protocol;
+	__le32 authorizer_len;
+	__u8  flags;         /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+	__u8 tag;
+	__le64 features;     /* feature bits for this session */
+	__le32 global_seq;
+	__le32 connect_seq;
+	__le32 protocol_version;
+	__le32 authorizer_len;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_inst src, orig_src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_name src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW     64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH    196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+	__le32 front_crc, middle_crc, data_crc;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
+
+
+#endif
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
new file mode 100644
index 000000000000..6c91fb032c39
--- /dev/null
+++ b/include/linux/ceph/osd_client.h
@@ -0,0 +1,234 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+struct ceph_pagelist;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+				     struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+	atomic_t o_ref;
+	struct ceph_osd_client *o_osdc;
+	int o_osd;
+	int o_incarnation;
+	struct rb_node o_node;
+	struct ceph_connection o_con;
+	struct list_head o_requests;
+	struct list_head o_osd_lru;
+	struct ceph_authorizer *o_authorizer;
+	void *o_authorizer_buf, *o_authorizer_reply_buf;
+	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+	unsigned long lru_ttl;
+	int o_marked_for_keepalive;
+	struct list_head o_keepalive_item;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+	u64             r_tid;              /* unique for this client */
+	struct rb_node  r_node;
+	struct list_head r_req_lru_item;
+	struct list_head r_osd_item;
+	struct ceph_osd *r_osd;
+	struct ceph_pg   r_pgid;
+	int              r_pg_osds[CEPH_PG_MAX_SIZE];
+	int              r_num_pg_osds;
+
+	struct ceph_connection *r_con_filling_msg;
+
+	struct ceph_msg  *r_request, *r_reply;
+	int               r_result;
+	int               r_flags;     /* any additional flags for the osd */
+	u32               r_sent;      /* >0 if r_request is sending/sent */
+	int               r_got_reply;
+
+	struct ceph_osd_client *r_osdc;
+	struct kref       r_kref;
+	bool              r_mempool;
+	struct completion r_completion, r_safe_completion;
+	ceph_osdc_callback_t r_callback, r_safe_callback;
+	struct ceph_eversion r_reassert_version;
+	struct list_head  r_unsafe_item;
+
+	struct inode *r_inode;         	      /* for use by callbacks */
+	void *r_priv;			      /* ditto */
+
+	char              r_oid[40];          /* object name */
+	int               r_oid_len;
+	unsigned long     r_stamp;            /* send OR check time */
+	bool              r_resend;           /* msg send failed, needs retry */
+
+	struct ceph_file_layout r_file_layout;
+	struct ceph_snap_context *r_snapc;    /* snap context for writes */
+	unsigned          r_num_pages;        /* size of page array (follows) */
+	struct page     **r_pages;            /* pages for data payload */
+	int               r_pages_from_pool;
+	int               r_own_pages;        /* if true, i own page list */
+#ifdef CONFIG_BLOCK
+	struct bio       *r_bio;	      /* instead of pages */
+#endif
+
+	struct ceph_pagelist *r_trail;	      /* trailing part of the data */
+};
+
+struct ceph_osd_client {
+	struct ceph_client     *client;
+
+	struct ceph_osdmap     *osdmap;       /* current map */
+	struct rw_semaphore    map_sem;
+	struct completion      map_waiters;
+	u64                    last_requested_map;
+
+	struct mutex           request_mutex;
+	struct rb_root         osds;          /* osds */
+	struct list_head       osd_lru;       /* idle osds */
+	u64                    timeout_tid;   /* tid of timeout triggering rq */
+	u64                    last_tid;      /* tid of last request */
+	struct rb_root         requests;      /* pending requests */
+	struct list_head       req_lru;	      /* pending requests lru */
+	int                    num_requests;
+	struct delayed_work    timeout_work;
+	struct delayed_work    osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry 	       *debugfs_file;
+#endif
+
+	mempool_t              *req_mempool;
+
+	struct ceph_msgpool	msgpool_op;
+	struct ceph_msgpool	msgpool_op_reply;
+};
+
+struct ceph_osd_req_op {
+	u16 op;           /* CEPH_OSD_OP_* */
+	u32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			u64 offset, length;
+			u64 truncate_size;
+			u32 truncate_seq;
+		} extent;
+		struct {
+			const char *name;
+			u32 name_len;
+			const char  *val;
+			u32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} xattr;
+		struct {
+			const char *class_name;
+			__u8 class_len;
+			const char *method_name;
+			__u8 method_len;
+			__u8 argc;
+			const char *indata;
+			u32 indata_len;
+		} cls;
+		struct {
+			u64 cookie, count;
+		} pgls;
+	        struct {
+		        u64 snapid;
+	        } snap;
+	};
+	u32 payload_len;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+			  struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+				   struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+				 struct ceph_msg *msg);
+
+extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req,
+				    u64 off, u64 *plen,
+				    struct ceph_osd_req_op *src_ops,
+				    struct ceph_snap_context *snapc,
+				    struct timespec *mtime,
+				    const char *oid,
+				    int oid_len);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+				      struct ceph_file_layout *layout,
+				      struct ceph_vino vino,
+				      u64 offset, u64 *len, int op, int flags,
+				      struct ceph_snap_context *snapc,
+				      int do_sync, u32 truncate_seq,
+				      u64 truncate_size,
+				      struct timespec *mtime,
+				      bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+	kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+	kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+				   struct ceph_osd_request *req,
+				   bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+				  struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			       struct ceph_vino vino,
+			       struct ceph_file_layout *layout,
+			       u64 off, u64 *plen,
+			       u32 truncate_seq, u64 truncate_size,
+			       struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+				struct ceph_vino vino,
+				struct ceph_file_layout *layout,
+				struct ceph_snap_context *sc,
+				u64 off, u64 len,
+				u32 truncate_seq, u64 truncate_size,
+				struct timespec *mtime,
+				struct page **pages, int nr_pages,
+				int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
new file mode 100644
index 000000000000..ba4c205cbb01
--- /dev/null
+++ b/include/linux/ceph/osdmap.h
@@ -0,0 +1,130 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include <linux/crush/crush.h>
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds.  That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+	struct rb_node node;
+	int id;
+	struct ceph_pg_pool v;
+	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	char *name;
+};
+
+struct ceph_pg_mapping {
+	struct rb_node node;
+	struct ceph_pg pgid;
+	int len;
+	int osds[];
+};
+
+struct ceph_osdmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 mkfs_epoch;
+	struct ceph_timespec created, modified;
+
+	u32 flags;         /* CEPH_OSDMAP_* */
+
+	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
+	u8 *osd_state;     /* CEPH_OSD_* */
+	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
+	struct ceph_entity_addr *osd_addr;
+
+	struct rb_root pg_temp;
+	struct rb_root pg_pools;
+	u32 pool_max;
+
+	/* the CRUSH map specifies the mapping of placement groups to
+	 * the list of osds that store+replicate them. */
+	struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+	((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+	((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+	((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_stripe_unit) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_object_size) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+	return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+						     int osd)
+{
+	if (osd >= map->max_osd)
+		return NULL;
+	return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					    struct ceph_osdmap *map,
+					    struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+					  u64 off, u64 *plen,
+					  u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+				   const char *oid,
+				   struct ceph_file_layout *fl,
+				   struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			       int *acting);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+				struct ceph_pg pgid);
+
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
+#endif
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h
new file mode 100644
index 000000000000..cc9327aa1c98
--- /dev/null
+++ b/include/linux/ceph/pagelist.h
@@ -0,0 +1,54 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+	struct list_head head;
+	void *mapped_tail;
+	size_t length;
+	size_t room;
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+	INIT_LIST_HEAD(&pl->head);
+	pl->mapped_tail = NULL;
+	pl->length = 0;
+	pl->room = 0;
+}
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+	__le64 ev = cpu_to_le64(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+	__le32 ev = cpu_to_le32(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+	__le16 ev = cpu_to_le16(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+	return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+					      char *s, size_t len)
+{
+	int ret = ceph_pagelist_encode_32(pl, len);
+	if (ret)
+		return ret;
+	if (len)
+		return ceph_pagelist_append(pl, s, len);
+	return 0;
+}
+
+#endif
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
new file mode 100644
index 000000000000..6d5247f2e81b
--- /dev/null
+++ b/include/linux/ceph/rados.h
@@ -0,0 +1,405 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+	__u8 type;                /* CEPH_PG_TYPE_* */
+	__u8 size;                /* number of osds in each pg */
+	__u8 crush_ruleset;       /* crush placement rule */
+	__u8 object_hash;         /* hash mapping object name to ps */
+	__le32 pg_num, pgp_num;   /* number of pg's */
+	__le32 lpg_num, lpgp_num; /* number of localized pg's */
+	__le32 last_change;       /* most recent epoch changed */
+	__le64 snap_seq;          /* seq for per-pool snapshot */
+	__le32 snap_epoch;        /* epoch of last snap */
+	__le32 num_snaps;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	if ((x & bmask) < b)
+		return x & bmask;
+	else
+		return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+	/** data **/
+	/* read */
+	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+	/* fancy read */
+	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+	/* write */
+	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+	/* fancy write */
+	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+	CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
+
+	/** attrs **/
+	/* read */
+	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
+
+	/* write */
+	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+	/** subop **/
+	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+	/** lock **/
+	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+	/** exec **/
+	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+	/** pg **/
+	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+	CEPH_OSD_FLAG_READ = 16,        /* op may read */
+	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS = 256,
+	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+	CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
+};
+
+enum {
+	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/* xattr comparison */
+enum {
+	CEPH_OSD_CMPXATTR_OP_NOP = 0,
+	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+	CEPH_OSD_CMPXATTR_OP_NE  = 2,
+	CEPH_OSD_CMPXATTR_OP_GT  = 3,
+	CEPH_OSD_CMPXATTR_OP_GTE = 4,
+	CEPH_OSD_CMPXATTR_OP_LT  = 5,
+	CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+	CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) extent;
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} __attribute__ ((packed)) xattr;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;
+		struct {
+			__le64 cookie, count;
+		} __attribute__ ((packed)) pgls;
+	        struct {
+		        __le64 snapid;
+	        } __attribute__ ((packed)) snap;
+	};
+	__le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+	__le32 client_inc;                 /* client incarnation */
+	struct ceph_object_layout layout;  /* pgid */
+	__le32 osdmap_epoch;               /* client's osdmap epoch */
+
+	__le32 flags;
+
+	struct ceph_timespec mtime;        /* for mutations only */
+	struct ceph_eversion reassert_version; /* if we are replaying op */
+
+	__le32 object_len;     /* length of object name */
+
+	__le64 snapid;         /* snapid to read */
+	__le64 snap_seq;       /* writer's snap context */
+	__le32 num_snaps;
+
+	__le16 num_ops;
+	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;
+	struct ceph_osd_op ops[0];  /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/include/linux/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+	u64 ino;
+	u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+	int count;
+};
+
+
+#endif
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
new file mode 100644
index 000000000000..97e435b191f4
--- /dev/null
+++ b/include/linux/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef CEPH_CRUSH_CRUSH_H
+#define CEPH_CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_SET   10  /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+	__u32 op;
+	__s32 arg1;
+	__s32 arg2;
+};
+
+/* step op codes */
+enum {
+	CRUSH_RULE_NOOP = 0,
+	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+				      /* arg2 = type */
+	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+	CRUSH_RULE_EMIT = 4,          /* no args */
+	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+	__u8 ruleset;
+	__u8 type;
+	__u8 min_size;
+	__u8 max_size;
+};
+
+struct crush_rule {
+	__u32 len;
+	struct crush_rule_mask mask;
+	struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+			      (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).  Items within a bucket are chosen using one of a
+ * few different algorithms.  The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ *  Bucket Alg     Speed       Additions    Removals
+ *  ------------------------------------------------
+ *  uniform         O(1)       poor         poor
+ *  list            O(n)       optimal      poor
+ *  tree            O(log n)   good         good
+ *  straw           O(n)       optimal      optimal
+ */
+enum {
+	CRUSH_BUCKET_UNIFORM = 1,
+	CRUSH_BUCKET_LIST = 2,
+	CRUSH_BUCKET_TREE = 3,
+	CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+	__s32 id;        /* this'll be negative */
+	__u16 type;      /* non-zero; type=0 is reserved for devices */
+	__u8 alg;        /* one of CRUSH_BUCKET_* */
+	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
+	__u32 weight;    /* 16-bit fixed point */
+	__u32 size;      /* num items */
+	__s32 *items;
+
+	/*
+	 * cached random permutation: used for uniform bucket and for
+	 * the linear search fallback for the other bucket types.
+	 */
+	__u32 perm_x;  /* @x for which *perm is defined */
+	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
+	__u32 *perm;
+};
+
+struct crush_bucket_uniform {
+	struct crush_bucket h;
+	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+	struct crush_bucket h;
+	__u32 *item_weights;  /* 16-bit fixed point */
+	__u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
+				 of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+				   actual items */
+	__u8 num_nodes;
+	__u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+	struct crush_bucket h;
+	__u32 *item_weights;   /* 16-bit fixed point */
+	__u32 *straws;         /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+	struct crush_bucket **buckets;
+	struct crush_rule **rules;
+
+	/*
+	 * Parent pointers to identify the parent bucket a device or
+	 * bucket in the hierarchy.  If an item appears more than
+	 * once, this is the _last_ time it appeared (where buckets
+	 * are processed in bucket id order, from -1 on down to
+	 * -max_buckets.
+	 */
+	__u32 *bucket_parents;
+	__u32 *device_parents;
+
+	__s32 max_buckets;
+	__u32 max_rules;
+	__s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h
new file mode 100644
index 000000000000..91e884230d5d
--- /dev/null
+++ b/include/linux/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_CRUSH_HASH_H
+#define CEPH_CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1   0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+			    __u32 e);
+
+#endif
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
new file mode 100644
index 000000000000..c46b99c18bb0
--- /dev/null
+++ b/include/linux/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef CEPH_CRUSH_MAPPER_H
+#define CEPH_CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+			 int ruleno,
+			 int x, int *result, int result_max,
+			 int forcefeed,    /* -1 for none */
+			 __u32 *weights);
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index e926884c1675..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
 source "net/rfkill/Kconfig"
 source "net/9p/Kconfig"
 source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
 
 
 endif   # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
 obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
+obj-$(CONFIG_CEPH_LIB)		+= ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Choose Y or M here to include cephlib, which provides the
+	  common functionality to both the Ceph filesystem and
+	  to the rados block device (rbd).
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_LIB
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This increases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+	debugfs.o \
+	auth.o auth_none.o \
+	crypto.o armor.o \
+	auth_x.o \
+	ceph_fs.o ceph_strings.o ceph_hash.o \
+	pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000000..eb2a666b0be7
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+	if (c >= 'A' && c <= 'Z')
+		return c - 'A';
+	if (c >= 'a' && c <= 'z')
+		return c - 'a' + 26;
+	if (c >= '0' && c <= '9')
+		return c - '0' + 52;
+	if (c == '+')
+		return 62;
+	if (c == '/')
+		return 63;
+	if (c == '=')
+		return 0; /* just non-negative, please */
+	return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+	int line = 0;
+
+	while (src < end) {
+		unsigned char a, b, c;
+
+		a = *src++;
+		*dst++ = encode_bits(a >> 2);
+		if (src < end) {
+			b = *src++;
+			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+			if (src < end) {
+				c = *src++;
+				*dst++ = encode_bits(((b & 15) << 2) |
+						     (c >> 6));
+				*dst++ = encode_bits(c & 63);
+			} else {
+				*dst++ = encode_bits((b & 15) << 2);
+				*dst++ = '=';
+			}
+		} else {
+			*dst++ = encode_bits(((a & 3) << 4));
+			*dst++ = '=';
+			*dst++ = '=';
+		}
+		olen += 4;
+		line += 4;
+		if (line == 64) {
+			line = 0;
+			*(dst++) = '\n';
+			olen++;
+		}
+	}
+	return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+
+	while (src < end) {
+		int a, b, c, d;
+
+		if (src < end && src[0] == '\n')
+			src++;
+		if (src + 4 > end)
+			return -EINVAL;
+		a = decode_bits(src[0]);
+		b = decode_bits(src[1]);
+		c = decode_bits(src[2]);
+		d = decode_bits(src[3]);
+		if (a < 0 || b < 0 || c < 0 || d < 0)
+			return -EINVAL;
+
+		*dst++ = (a << 2) | (b >> 4);
+		if (src[2] == '=')
+			return olen + 1;
+		*dst++ = ((b & 15) << 4) | (c >> 2);
+		if (src[3] == '=')
+			return olen + 2;
+		*dst++ = ((c & 3) << 6) | d;
+		olen += 3;
+		src += 4;
+	}
+	return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000000..549c1f43e1d5
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+	CEPH_AUTH_NONE,
+	CEPH_AUTH_CEPHX
+};
+
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+	switch (protocol) {
+	case CEPH_AUTH_NONE:
+		return ceph_auth_none_init(ac);
+	case CEPH_AUTH_CEPHX:
+		return ceph_x_init(ac);
+	default:
+		return -ENOENT;
+	}
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+	struct ceph_auth_client *ac;
+	int ret;
+
+	dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+	ret = -ENOMEM;
+	ac = kzalloc(sizeof(*ac), GFP_NOFS);
+	if (!ac)
+		goto out;
+
+	ac->negotiating = true;
+	if (name)
+		ac->name = name;
+	else
+		ac->name = CEPH_AUTH_NAME_DEFAULT;
+	dout("auth_init name %s secret %s\n", ac->name, secret);
+	ac->secret = secret;
+	return ac;
+
+out:
+	return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+	dout("auth_destroy %p\n", ac);
+	if (ac->ops)
+		ac->ops->destroy(ac);
+	kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+	dout("auth_reset %p\n", ac);
+	if (ac->ops && !ac->negotiating)
+		ac->ops->reset(ac);
+	ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+	int len = strlen(name);
+
+	if (*p + 2*sizeof(u32) + len > end)
+		return -ERANGE;
+	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_32(p, len);
+	ceph_encode_copy(p, name, len);
+	return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+	struct ceph_mon_request_header *monhdr = buf;
+	void *p = monhdr + 1, *end = buf + len, *lenp;
+	int i, num;
+	int ret;
+
+	dout("auth_build_hello\n");
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+	lenp = p;
+	p += sizeof(u32);
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	ceph_encode_8(&p, 1);
+	num = ARRAY_SIZE(supported_protocols);
+	ceph_encode_32(&p, num);
+	ceph_decode_need(&p, end, num * sizeof(u32), bad);
+	for (i = 0; i < num; i++)
+		ceph_encode_32(&p, supported_protocols[i]);
+
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+	ceph_decode_need(&p, end, sizeof(u64), bad);
+	ceph_encode_64(&p, ac->global_id);
+
+	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+	return p - buf;
+
+bad:
+	return -ERANGE;
+}
+
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+				   void *msg_buf, size_t msg_len)
+{
+	struct ceph_mon_request_header *monhdr = msg_buf;
+	void *p = monhdr + 1;
+	void *end = msg_buf + msg_len;
+	int ret;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, ac->protocol);
+
+	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+	if (ret < 0) {
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
+		return ret;
+	}
+	dout(" built request %d bytes\n", ret);
+	ceph_encode_32(&p, ret);
+	return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+			   void *buf, size_t len,
+			   void *reply_buf, size_t reply_len)
+{
+	void *p = buf;
+	void *end = buf + len;
+	int protocol;
+	s32 result;
+	u64 global_id;
+	void *payload, *payload_end;
+	int payload_len;
+	char *result_msg;
+	int result_msg_len;
+	int ret = -EINVAL;
+
+	dout("handle_auth_reply %p %p\n", p, end);
+	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+	protocol = ceph_decode_32(&p);
+	result = ceph_decode_32(&p);
+	global_id = ceph_decode_64(&p);
+	payload_len = ceph_decode_32(&p);
+	payload = p;
+	p += payload_len;
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	result_msg_len = ceph_decode_32(&p);
+	result_msg = p;
+	p += result_msg_len;
+	if (p != end)
+		goto bad;
+
+	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+	     result_msg, global_id, payload_len);
+
+	payload_end = payload + payload_len;
+
+	if (global_id && ac->global_id != global_id) {
+		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+		ac->global_id = global_id;
+	}
+
+	if (ac->negotiating) {
+		/* server does not support our protocols? */
+		if (!protocol && result < 0) {
+			ret = result;
+			goto out;
+		}
+		/* set up (new) protocol handler? */
+		if (ac->protocol && ac->protocol != protocol) {
+			ac->ops->destroy(ac);
+			ac->protocol = 0;
+			ac->ops = NULL;
+		}
+		if (ac->protocol != protocol) {
+			ret = ceph_auth_init_protocol(ac, protocol);
+			if (ret) {
+				pr_err("error %d on auth protocol %d init\n",
+				       ret, protocol);
+				goto out;
+			}
+		}
+
+		ac->negotiating = false;
+	}
+
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	if (ret == -EAGAIN) {
+		return ceph_build_auth_request(ac, reply_buf, reply_len);
+	} else if (ret) {
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+		return ret;
+	}
+	return 0;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+out:
+	return ret;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len)
+{
+	if (!ac->protocol)
+		return ceph_auth_build_hello(ac, msg_buf, msg_len);
+	BUG_ON(!ac->ops);
+	if (ac->ops->should_authenticate(ac))
+		return ceph_build_auth_request(ac, msg_buf, msg_len);
+	return 0;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+	if (!ac->ops)
+		return 0;
+	return ac->ops->is_authenticated(ac);
+}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000000..214c2bb43d62
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+			void *buf, void *end)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = false;
+	return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+	struct ceph_none_authorizer *au = &ai->au;
+	void *p, *end;
+	int ret;
+
+	if (!ai->built_authorizer) {
+		p = au->buf;
+		end = p + sizeof(au->buf);
+		ceph_encode_8(&p, 1);
+		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+		if (ret < 0)
+			goto bad;
+		ceph_decode_need(&p, end, sizeof(u64), bad2);
+		ceph_encode_64(&p, ac->global_id);
+		au->buf_len = p - (void *)au->buf;
+		ai->built_authorizer = true;
+		dout("built authorizer len %d\n", au->buf_len);
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf;
+	*len = au->buf_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+
+bad2:
+	ret = -ERANGE;
+bad:
+	return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	/* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
+	.reset = reset,
+	.destroy = destroy,
+	.is_authenticated = is_authenticated,
+	.should_authenticate = should_authenticate,
+	.handle_reply = handle_reply,
+	.create_authorizer = ceph_auth_none_create_authorizer,
+	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi;
+
+	dout("ceph_auth_none_init %p\n", ac);
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+
+	ac->protocol = CEPH_AUTH_NONE;
+	ac->private = xi;
+	ac->ops = &ceph_auth_none_ops;
+	return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000000..ed7d088b1bc9
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+	char buf[128];
+	int buf_len;
+	char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+	bool starting;
+	bool built_authorizer;
+	struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000000..7fd5dfcf6e18
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+#define TEMP_TICKET_BUF_LEN	256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+			  void *ibuf, int ilen, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head = {
+		.struct_v = 1,
+		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+	};
+	size_t len = olen - sizeof(u32);
+	int ret;
+
+	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+			    &head, sizeof(head), ibuf, ilen);
+	if (ret)
+		return ret;
+	ceph_encode_32(&obuf, len);
+	return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+			  void **p, void *end, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head;
+	size_t head_len = sizeof(head);
+	int len, ret;
+
+	len = ceph_decode_32(p);
+	if (*p + len > end)
+		return -EINVAL;
+
+	dout("ceph_x_decrypt len %d\n", len);
+	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+			    *p, len);
+	if (ret)
+		return ret;
+	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+		return -EPERM;
+	*p += len;
+	return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+	struct ceph_x_ticket_handler *th;
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+	while (*p) {
+		parent = *p;
+		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+		if (service < th->service)
+			p = &(*p)->rb_left;
+		else if (service > th->service)
+			p = &(*p)->rb_right;
+		else
+			return th;
+	}
+
+	/* add it */
+	th = kzalloc(sizeof(*th), GFP_NOFS);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+	th->service = service;
+	rb_link_node(&th->node, parent, p);
+	rb_insert_color(&th->node, &xi->ticket_handlers);
+	return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+				  struct ceph_x_ticket_handler *th)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("remove_ticket_handler %p %d\n", th, th->service);
+	rb_erase(&th->node, &xi->ticket_handlers);
+	ceph_crypto_key_destroy(&th->session_key);
+	if (th->ticket_blob)
+		ceph_buffer_put(th->ticket_blob);
+	kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+				    struct ceph_crypto_key *secret,
+				    void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int num;
+	void *p = buf;
+	int ret;
+	char *dbuf;
+	char *ticket_buf;
+	u8 reply_struct_v;
+
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!dbuf)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!ticket_buf)
+		goto out_dbuf;
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
+		goto bad;
+	num = ceph_decode_32(&p);
+	dout("%d tickets\n", num);
+	while (num--) {
+		int type;
+		u8 tkt_struct_v, blob_struct_v;
+		struct ceph_x_ticket_handler *th;
+		void *dp, *dend;
+		int dlen;
+		char is_enc;
+		struct timespec validity;
+		struct ceph_crypto_key old_key;
+		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
+
+		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+		type = ceph_decode_32(&p);
+		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		th = get_ticket_handler(ac, type);
+		if (IS_ERR(th)) {
+			ret = PTR_ERR(th);
+			goto out;
+		}
+
+		/* blob for me */
+		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+				      TEMP_TICKET_BUF_LEN);
+		if (dlen <= 0) {
+			ret = dlen;
+			goto out;
+		}
+		dout(" decrypted %d bytes\n", dlen);
+		dend = dbuf + dlen;
+		dp = dbuf;
+
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		memcpy(&old_key, &th->session_key, sizeof(old_key));
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+		if (ret)
+			goto out;
+
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
+
+		/* ticket blob for service */
+		ceph_decode_8_safe(&p, end, is_enc, bad);
+		tp = ticket_buf;
+		if (is_enc) {
+			/* encrypted */
+			dout(" encrypted ticket\n");
+			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+					      TEMP_TICKET_BUF_LEN);
+			if (dlen < 0) {
+				ret = dlen;
+				goto out;
+			}
+			dlen = ceph_decode_32(&tp);
+		} else {
+			/* unencrypted */
+			ceph_decode_32_safe(&p, end, dlen, bad);
+			ceph_decode_need(&p, end, dlen, bad);
+			ceph_decode_copy(&p, ticket_buf, dlen);
+		}
+		tpend = tp + dlen;
+		dout(" ticket blob is %d bytes\n", dlen);
+		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+		if (ret)
+			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
+		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+		     type, ceph_entity_type_name(type), th->secret_id,
+		     (int)th->ticket_blob->vec.iov_len);
+		xi->have_keys |= th->service;
+	}
+
+	ret = 0;
+out:
+	kfree(ticket_buf);
+out_dbuf:
+	kfree(dbuf);
+	return ret;
+
+bad:
+	ret = -EINVAL;
+	goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+				   struct ceph_x_ticket_handler *th,
+				   struct ceph_x_authorizer *au)
+{
+	int maxlen;
+	struct ceph_x_authorize_a *msg_a;
+	struct ceph_x_authorize_b msg_b;
+	void *p, *end;
+	int ret;
+	int ticket_blob_len =
+		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+	dout("build_authorizer for %s %p\n",
+	     ceph_entity_type_name(th->service), au);
+
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
+		ceph_buffer_put(au->buf);
+		au->buf = NULL;
+	}
+	if (!au->buf) {
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+		if (!au->buf)
+			return -ENOMEM;
+	}
+	au->service = th->service;
+
+	msg_a = au->buf->vec.iov_base;
+	msg_a->struct_v = 1;
+	msg_a->global_id = cpu_to_le64(ac->global_id);
+	msg_a->service_id = cpu_to_le32(th->service);
+	msg_a->ticket_blob.struct_v = 1;
+	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+	if (ticket_blob_len) {
+		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+		       th->ticket_blob->vec.iov_len);
+	}
+	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+	     le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+	p = msg_a + 1;
+	p += ticket_blob_len;
+	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+	get_random_bytes(&au->nonce, sizeof(au->nonce));
+	msg_b.struct_v = 1;
+	msg_b.nonce = cpu_to_le64(au->nonce);
+	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+			     p, end - p);
+	if (ret < 0)
+		goto out_buf;
+	p += ret;
+	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+	dout(" built authorizer nonce %llx len %d\n", au->nonce,
+	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
+	return 0;
+
+out_buf:
+	ceph_buffer_put(au->buf);
+	au->buf = NULL;
+	return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+				void **p, void *end)
+{
+	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, th->secret_id);
+	if (th->ticket_blob) {
+		const char *buf = th->ticket_blob->vec.iov_base;
+		u32 len = th->ticket_blob->vec.iov_len;
+
+		ceph_encode_32_safe(p, end, len, bad);
+		ceph_encode_copy_safe(p, end, buf, len, bad);
+	} else {
+		ceph_encode_32_safe(p, end, 0, bad);
+	}
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+	int want = ac->want_keys;
+	struct ceph_x_info *xi = ac->private;
+	int service;
+
+	*pneed = ac->want_keys & ~(xi->have_keys);
+
+	for (service = 1; service <= want; service <<= 1) {
+		struct ceph_x_ticket_handler *th;
+
+		if (!(ac->want_keys & service))
+			continue;
+
+		if (*pneed & service)
+			continue;
+
+		th = get_ticket_handler(ac, service);
+
+		if (IS_ERR(th)) {
+			*pneed |= service;
+			continue;
+		}
+
+		if (get_seconds() >= th->renew_after)
+			*pneed |= service;
+		if (get_seconds() >= th->expires)
+			xi->have_keys &= ~service;
+	}
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+				void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+	struct ceph_x_request_header *head = buf;
+	int ret;
+	struct ceph_x_ticket_handler *th =
+		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	ceph_x_validate_tickets(ac, &need);
+
+	dout("build_request want %x have %x need %x\n",
+	     ac->want_keys, xi->have_keys, need);
+
+	if (need & CEPH_ENTITY_TYPE_AUTH) {
+		struct ceph_x_authenticate *auth = (void *)(head + 1);
+		void *p = auth + 1;
+		struct ceph_x_challenge_blob tmp;
+		char tmp_enc[40];
+		u64 *u;
+
+		if (p > end)
+			return -ERANGE;
+
+		dout(" get_auth_session_key\n");
+		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+		/* encrypt and hash */
+		get_random_bytes(&auth->client_challenge, sizeof(u64));
+		tmp.client_challenge = auth->client_challenge;
+		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+				     tmp_enc, sizeof(tmp_enc));
+		if (ret < 0)
+			return ret;
+
+		auth->struct_v = 1;
+		auth->key = 0;
+		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+			auth->key ^= *(__le64 *)u;
+		dout(" server_challenge %llx client_challenge %llx key %llx\n",
+		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
+		     le64_to_cpu(auth->key));
+
+		/* now encode the old ticket if exists */
+		ret = ceph_x_encode_ticket(th, &p, end);
+		if (ret < 0)
+			return ret;
+
+		return p - buf;
+	}
+
+	if (need) {
+		void *p = head + 1;
+		struct ceph_x_service_ticket_request *req;
+
+		if (p > end)
+			return -ERANGE;
+		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+		if (ret)
+			return ret;
+		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+				 xi->auth_authorizer.buf->vec.iov_len);
+
+		req = p;
+		req->keys = cpu_to_le32(need);
+		p += sizeof(*req);
+		return p - buf;
+	}
+
+	return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+			       void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_reply_header *head = buf;
+	struct ceph_x_ticket_handler *th;
+	int len = end - buf;
+	int op;
+	int ret;
+
+	if (result)
+		return result;  /* XXX hmm? */
+
+	if (xi->starting) {
+		/* it's a hello */
+		struct ceph_x_server_challenge *sc = buf;
+
+		if (len != sizeof(*sc))
+			return -EINVAL;
+		xi->server_challenge = le64_to_cpu(sc->server_challenge);
+		dout("handle_reply got server challenge %llx\n",
+		     xi->server_challenge);
+		xi->starting = false;
+		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+		return -EAGAIN;
+	}
+
+	op = le16_to_cpu(head->op);
+	result = le32_to_cpu(head->result);
+	dout("handle_reply op %d result %d\n", op, result);
+	switch (op) {
+	case CEPHX_GET_AUTH_SESSION_KEY:
+		/* verify auth key */
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+					       buf + sizeof(*head), end);
+		break;
+
+	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+		if (IS_ERR(th))
+			return PTR_ERR(th);
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       buf + sizeof(*head), end);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+	if (ac->want_keys == xi->have_keys)
+		return 0;
+	return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_x_authorizer *au;
+	struct ceph_x_ticket_handler *th;
+	int ret;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	au = kzalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	ret = ceph_x_build_authorizer(ac, th, au);
+	if (ret) {
+		kfree(au);
+		return ret;
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf->vec.iov_base;
+	*len = au->buf->vec.iov_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a, size_t len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	struct ceph_x_ticket_handler *th;
+	int ret = 0;
+	struct ceph_x_authorize_reply reply;
+	void *p = au->reply_buf;
+	void *end = p + sizeof(au->reply_buf);
+
+	th = get_ticket_handler(ac, au->service);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(reply))
+		return -EPERM;
+
+	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+		ret = -EPERM;
+	else
+		ret = 0;
+	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+	return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_buffer_put(au->buf);
+	kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("reset\n");
+	xi->starting = true;
+	xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *p;
+
+	dout("ceph_x_destroy %p\n", ac);
+	ceph_crypto_key_destroy(&xi->secret);
+
+	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+		struct ceph_x_ticket_handler *th =
+			rb_entry(p, struct ceph_x_ticket_handler, node);
+		remove_ticket_handler(ac, th);
+	}
+
+	if (xi->auth_authorizer.buf)
+		ceph_buffer_put(xi->auth_authorizer.buf);
+
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+				   int peer_type)
+{
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (!IS_ERR(th))
+		remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
+	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
+	.build_request = ceph_x_build_request,
+	.handle_reply = ceph_x_handle_reply,
+	.create_authorizer = ceph_x_create_authorizer,
+	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+	.destroy_authorizer = ceph_x_destroy_authorizer,
+	.invalidate_authorizer = ceph_x_invalidate_authorizer,
+	.reset =  ceph_x_reset,
+	.destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi;
+	int ret;
+
+	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		goto out;
+
+	ret = -EINVAL;
+	if (!ac->secret) {
+		pr_err("no secret set (for auth_x protocol)\n");
+		goto out_nomem;
+	}
+
+	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+	if (ret)
+		goto out_nomem;
+
+	xi->starting = true;
+	xi->ticket_handlers = RB_ROOT;
+
+	ac->protocol = CEPH_AUTH_CEPHX;
+	ac->private = xi;
+	ac->ops = &ceph_x_ops;
+	return 0;
+
+out_nomem:
+	kfree(xi);
+out:
+	return ret;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000000..e02da7a5c5a1
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+	struct rb_node node;
+	unsigned service;
+
+	struct ceph_crypto_key session_key;
+	struct ceph_timespec validity;
+
+	u64 secret_id;
+	struct ceph_buffer *ticket_blob;
+
+	unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+	struct ceph_buffer *buf;
+	unsigned service;
+	u64 nonce;
+	char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+	struct ceph_crypto_key secret;
+
+	bool starting;
+	u64 server_challenge;
+
+	unsigned have_keys;
+	struct rb_root ticket_handlers;
+
+	struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+	__u8 struct_v;
+	__le64 secret_id;
+	__le32 blob_len;
+	char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+	__le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+	__le16 op;
+	__le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+	__u8 struct_v;
+	__le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+	__u8 struct_v;
+	__le64 client_challenge;
+	__le64 key;
+	/* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+	__u8 struct_v;
+	__le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+	__le64 server_challenge;
+	__le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+	__u8 struct_v;
+	__le64 global_id;
+	__le32 service_id;
+	struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+	__u8 struct_v;
+	__le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+	__u8 struct_v;
+	__le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+	__u8 struct_v;
+	__le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000000..53d8abfa25d5
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+	struct ceph_buffer *b;
+
+	b = kmalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		if (!b->vec.iov_base) {
+			kfree(b);
+			return NULL;
+		}
+		b->is_vmalloc = true;
+	}
+
+	kref_init(&b->kref);
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	dout("buffer_new %p\n", b);
+	return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+void ceph_buffer_release(struct kref *kref)
+{
+	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+	dout("buffer_release %p\n", b);
+	if (b->vec.iov_base) {
+		if (b->is_vmalloc)
+			vfree(b->vec.iov_base);
+		else
+			kfree(b->vec.iov_base);
+	}
+	kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+	size_t len;
+
+	ceph_decode_need(p, end, sizeof(u32), bad);
+	len = ceph_decode_32(p);
+	dout("decode_buffer len %d\n", (int)len);
+	ceph_decode_need(p, end, len, bad);
+	*b = ceph_buffer_new(len, GFP_NOFS);
+	if (!*b)
+		return -ENOMEM;
+	ceph_decode_copy(p, (*b)->vec.iov_base, len);
+	return 0;
+bad:
+	return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f6f2eebc0767
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			pr_err("bad fsid, had %pU got %pU",
+			       &client->fsid, fsid);
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+			 struct ceph_client *client)
+{
+	struct ceph_options *opt1 = new_opt;
+	struct ceph_options *opt2 = client->options;
+	int ofs = offsetof(struct ceph_options, mon_addr);
+	int i;
+	int ret;
+
+	ret = memcmp(opt1, opt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->name, opt2->name);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->secret, opt2->secret);
+	if (ret)
+		return ret;
+
+	/* any matching mon ip implies a match */
+	for (i = 0; i < opt1->num_mon; i++) {
+		if (ceph_monmap_contains(client->monc.monmap,
+				 &opt1->mon_addr[i]))
+			return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+	int i = 0;
+	char tmp[3];
+	int err = -EINVAL;
+	int d;
+
+	dout("parse_fsid '%s'\n", str);
+	tmp[2] = 0;
+	while (*str && i < 16) {
+		if (ispunct(*str)) {
+			str++;
+			continue;
+		}
+		if (!isxdigit(str[0]) || !isxdigit(str[1]))
+			break;
+		tmp[0] = str[0];
+		tmp[1] = str[1];
+		if (sscanf(tmp, "%x", &d) < 1)
+			break;
+		fsid->fsid[i] = d & 0xff;
+		i++;
+		str += 2;
+	}
+
+	if (i == 16)
+		err = 0;
+	dout("parse_fsid ret %d got fsid %pU", err, fsid);
+	return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_last_int,
+	/* int args above */
+	Opt_fsid,
+	Opt_name,
+	Opt_secret,
+	Opt_ip,
+	Opt_last_string,
+	/* string args above */
+	Opt_noshare,
+	Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	/* int args above */
+	{Opt_fsid, "fsid=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	{Opt_ip, "ip=%s"},
+	/* string args above */
+	{Opt_noshare, "noshare"},
+	{Opt_nocrc, "nocrc"},
+	{-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+	dout("destroy_options %p\n", opt);
+	kfree(opt->name);
+	kfree(opt->secret);
+	kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+int ceph_parse_options(struct ceph_options **popt, char *options,
+		       const char *dev_name, const char *dev_name_end,
+		       int (*parse_extra_token)(char *c, void *private),
+		       void *private)
+{
+	struct ceph_options *opt;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+	if (!opt)
+		return err;
+	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+				GFP_KERNEL);
+	if (!opt->mon_addr)
+		goto out;
+
+	dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+	     dev_name);
+
+	/* start with defaults */
+	opt->flags = CEPH_OPT_DEFAULT;
+	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+	/* get mon ip(s) */
+	/* ip1[:port1][,ip2[:port2]...] */
+	err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+			     CEPH_MAX_MON, &opt->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, opt_tokens, argstr);
+		if (token < 0) {
+			/* extra? */
+			err = parse_extra_token((char *)c, private);
+			if (err < 0) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			continue;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &opt->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			opt->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_fsid:
+			err = parse_fsid(argstr[0].from, &opt->fsid);
+			if (err == 0)
+				opt->flags |= CEPH_OPT_FSID;
+			break;
+		case Opt_name:
+			opt->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_secret:
+			opt->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			break;
+
+			/* misc */
+		case Opt_osdtimeout:
+			opt->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			opt->osd_keepalive_timeout = intval;
+			break;
+		case Opt_osd_idle_ttl:
+			opt->osd_idle_ttl = intval;
+			break;
+		case Opt_mount_timeout:
+			opt->mount_timeout = intval;
+			break;
+
+		case Opt_noshare:
+			opt->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_nocrc:
+			opt->flags |= CEPH_OPT_NOCRC;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+
+	/* success */
+	*popt = opt;
+	return 0;
+
+out:
+	ceph_destroy_options(opt);
+	return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+	return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	client->private = private;
+	client->options = opt;
+
+	mutex_init(&client->mount_mutex);
+	init_waitqueue_head(&client->auth_wq);
+	client->auth_err = 0;
+
+	client->extra_mon_dispatch = NULL;
+	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+	client->msgr = NULL;
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+
+	return client;
+
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure mds and osd connections close out before destroying
+	 * the auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_debugfs_client_cleanup(client);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+
+	ceph_destroy_options(client->options);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->options->mount_timeout * HZ;
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->options->my_addr;
+		client->msgr = ceph_messenger_create(myaddr,
+					client->supported_features,
+					client->required_features);
+		if (IS_ERR(client->msgr)) {
+			client->msgr = NULL;
+			return PTR_ERR(client->msgr);
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		return err;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			return err;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			have_mon_and_osd_map(client) || (client->auth_err < 0),
+			timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			return err;
+		if (client->auth_err < 0)
+			return client->auth_err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+	int ret;
+	unsigned long started = jiffies;  /* note the start time */
+
+	dout("open_session start\n");
+	mutex_lock(&client->mount_mutex);
+
+	ret = __ceph_open_session(client, started);
+
+	mutex_unlock(&client->mount_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+	return 0;
+
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+	dout("exit_ceph_lib\n");
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 000000000000..a3a3a31d3c37
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	__u32 os = le32_to_cpu(layout->fl_object_size);
+
+	/* stripe unit, object size must be non-zero, 64k increment */
+	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (os < su || os % su)
+		return 0;
+	/* stripe count must be non-zero */
+	if (!sc)
+		return 0;
+	return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+	int mode;
+
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+	if ((flags & O_APPEND) == O_APPEND)
+		flags |= O_WRONLY;
+
+	if ((flags & O_ACCMODE) == O_RDWR)
+		mode = CEPH_FILE_MODE_RDWR;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		mode = CEPH_FILE_MODE_WR;
+	else
+		mode = CEPH_FILE_MODE_RD;
+
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+	return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+int ceph_caps_for_mode(int mode)
+{
+	int caps = CEPH_CAP_PIN;
+
+	if (mode & CEPH_FILE_MODE_RD)
+		caps |= CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	if (mode & CEPH_FILE_MODE_WR)
+		caps |= CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	if (mode & CEPH_FILE_MODE_LAZY)
+		caps |= CEPH_CAP_FILE_LAZYIO;
+
+	return caps;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000000..815ef8826796
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include <linux/ceph/types.h>
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	unsigned long hash = 0;
+	unsigned char c;
+
+	while (length--) {
+		c = *str++;
+		hash = (hash + (c << 4) + (c >> 4)) * 11;
+	}
+	return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return ceph_str_hash_linux(s, len);
+	case CEPH_STR_HASH_RJENKINS:
+		return ceph_str_hash_rjenkins(s, len);
+	default:
+		return -1;
+	}
+}
+
+const char *ceph_str_hash_name(int type)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return "linux";
+	case CEPH_STR_HASH_RJENKINS:
+		return "rjenkins";
+	default:
+		return "unknown";
+	}
+}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+	case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000000..d6ebb13a18a4
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+	if (p >= b->size)
+		return 0;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return ((struct crush_bucket_uniform *)b)->item_weight;
+	case CRUSH_BUCKET_LIST:
+		return ((struct crush_bucket_list *)b)->item_weights[p];
+	case CRUSH_BUCKET_TREE:
+		if (p & 1)
+			return ((struct crush_bucket_tree *)b)->node_weights[p];
+		return 0;
+	case CRUSH_BUCKET_STRAW:
+		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	}
+	return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+	int i, b, c;
+
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == NULL)
+			continue;
+		for (i = 0; i < map->buckets[b]->size; i++) {
+			c = map->buckets[b]->items[i];
+			BUG_ON(c >= map->max_devices ||
+			       c < -map->max_buckets);
+			if (c >= 0)
+				map->device_parents[c] = map->buckets[b]->id;
+			else
+				map->bucket_parents[-1-c] = map->buckets[b]->id;
+		}
+	}
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->item_weights);
+	kfree(b->sum_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->node_weights);
+	kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->straws);
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+		break;
+	}
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+	int b;
+
+	/* buckets */
+	if (map->buckets) {
+		for (b = 0; b < map->max_buckets; b++) {
+			if (map->buckets[b] == NULL)
+				continue;
+			crush_destroy_bucket(map->buckets[b]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules */
+	if (map->rules) {
+		for (b = 0; b < map->max_rules; b++)
+			kfree(map->rules[b]);
+		kfree(map->rules);
+	}
+
+	kfree(map->bucket_parents);
+	kfree(map->device_parents);
+	kfree(map);
+}
+
+
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000000..5bb63e37a8a1
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+	__u32 hash = crush_hash_seed ^ a;
+	__u32 b = a;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, a, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(a, x, hash);
+	crush_hashmix(y, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, d, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(e, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	crush_hashmix(d, x, hash);
+	crush_hashmix(y, e, hash);
+	return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1(a);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_2(a, b);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_3(a, b, c);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+	default:
+		return 0;
+	}
+}
+
+const char *crush_hash_name(int type)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return "rjenkins1";
+	default:
+		return "unknown";
+	}
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000000..42599e31dcad
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+	int i;
+
+	for (i = 0; i < map->max_rules; i++) {
+		if (map->rules[i] &&
+		    map->rules[i]->mask.ruleset == ruleset &&
+		    map->rules[i]->mask.type == type &&
+		    map->rules[i]->mask.min_size <= size &&
+		    map->rules[i]->mask.max_size >= size)
+			return i;
+	}
+	return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+			      int x, int r)
+{
+	unsigned pr = r % bucket->size;
+	unsigned i, s;
+
+	/* start a new permutation if @x has changed */
+	if (bucket->perm_x != x || bucket->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		bucket->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+				bucket->size;
+			bucket->perm[0] = s;
+			bucket->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		for (i = 0; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm_n = 0;
+	} else if (bucket->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm[bucket->perm[0]] = 0;
+		bucket->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < bucket->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+	while (bucket->perm_n <= pr) {
+		unsigned p = bucket->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned t = bucket->perm[p + i];
+				bucket->perm[p + i] = bucket->perm[p];
+				bucket->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		bucket->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+	s = bucket->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+				 int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int i;
+
+	for (i = bucket->h.size-1; i >= 0; i--) {
+		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+					 r, bucket->h.id);
+		w &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			i, x, r, bucket->h.items[i], bucket->item_weights[i],
+			bucket->sum_weights[i], w);
+		w *= bucket->sum_weights[i];
+		w = w >> 16;
+		/*dprintk(" scaled %llx\n", w);*/
+		if (w < bucket->item_weights[i])
+			return bucket->h.items[i];
+	}
+
+	BUG_ON(1);
+	return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+	int h = 0;
+	while ((n & 1) == 0) {
+		h++;
+		n = n >> 1;
+	}
+	return h;
+}
+
+static int left(int x)
+{
+	int h = height(x);
+	return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+	int h = height(x);
+	return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+	return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	int n, l;
+	__u32 w;
+	__u64 t;
+
+	/* start at root */
+	n = bucket->num_nodes >> 1;
+
+	while (!terminal(n)) {
+		/* pick point in [0, w) */
+		w = bucket->node_weights[n];
+		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+					  bucket->h.id) * (__u64)w;
+		t = t >> 32;
+
+		/* descend to the left or right? */
+		l = left(n);
+		if (t < bucket->node_weights[l])
+			n = l;
+		else
+			n = right(n);
+	}
+
+	return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int i;
+	int high = 0;
+	__u64 high_draw = 0;
+	__u64 draw;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+		draw &= 0xffff;
+		draw *= bucket->straws[i];
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	switch (in->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+					  x, r);
+	case CRUSH_BUCKET_LIST:
+		return bucket_list_choose((struct crush_bucket_list *)in,
+					  x, r);
+	case CRUSH_BUCKET_TREE:
+		return bucket_tree_choose((struct crush_bucket_tree *)in,
+					  x, r);
+	case CRUSH_BUCKET_STRAW:
+		return bucket_straw_choose((struct crush_bucket_straw *)in,
+					   x, r);
+	default:
+		BUG_ON(1);
+		return in->items[0];
+	}
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+	if (weight[item] >= 0x10000)
+		return 0;
+	if (weight[item] == 0)
+		return 1;
+	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+	    < weight[item])
+		return 0;
+	return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+			struct crush_bucket *bucket,
+			__u32 *weight,
+			int x, int numrep, int type,
+			int *out, int outpos,
+			int firstn, int recurse_to_leaf,
+			int *out2)
+{
+	int rep;
+	int ftotal, flocal;
+	int retry_descent, retry_bucket, skip_rep;
+	struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide, reject;
+	const int orig_tries = 5; /* attempts before we fall back to search */
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
+
+	for (rep = outpos; rep < numrep; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;               /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				collide = 0;
+				retry_bucket = 0;
+				r = rep;
+				if (in->alg == CRUSH_BUCKET_UNIFORM) {
+					/* be careful */
+					if (firstn || numrep >= in->size)
+						/* r' = r + f_total */
+						r += ftotal;
+					else if (in->size % numrep == 0)
+						/* r'=r+(n+1)*f_local */
+						r += (numrep+1) *
+							(flocal+ftotal);
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				} else {
+					if (firstn)
+						/* r' = r + f_total */
+						r += ftotal;
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				}
+
+				/* bucket choose */
+				if (in->size == 0) {
+					reject = 1;
+					goto reject;
+				}
+				if (flocal >= (in->size>>1) &&
+				    flocal > orig_tries)
+					item = bucket_perm_choose(in, x, r);
+				else
+					item = crush_bucket_choose(in, x, r);
+				BUG_ON(item >= map->max_devices);
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					BUG_ON(item >= 0 ||
+					       (-1-item) >= map->max_buckets);
+					in = map->buckets[-1-item];
+					retry_bucket = 1;
+					continue;
+				}
+
+				/* collision? */
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				reject = 0;
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						if (crush_choose(map,
+							 map->buckets[-1-item],
+							 weight,
+							 x, outpos+1, 0,
+							 out2, outpos,
+							 firstn, 0,
+							 NULL) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+					}
+				}
+
+				if (!reject) {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								item, x);
+					else
+						reject = 0;
+				}
+
+reject:
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal < 3)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (flocal < in->size + orig_tries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < 20)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %d  flocal %d\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("CHOOSE got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+	}
+
+	dprintk("CHOOSE returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  int force, __u32 *weight)
+{
+	int result_len;
+	int force_context[CRUSH_MAX_DEPTH];
+	int force_pos = -1;
+	int a[CRUSH_MAX_SET];
+	int b[CRUSH_MAX_SET];
+	int c[CRUSH_MAX_SET];
+	int recurse_to_leaf;
+	int *w;
+	int wsize = 0;
+	int *o;
+	int osize;
+	int *tmp;
+	struct crush_rule *rule;
+	int step;
+	int i, j;
+	int numrep;
+	int firstn;
+	int rc = -1;
+
+	BUG_ON(ruleno >= map->max_rules);
+
+	rule = map->rules[ruleno];
+	result_len = 0;
+	w = a;
+	o = b;
+
+	/*
+	 * determine hierarchical context of force, if any.  note
+	 * that this may or may not correspond to the specific types
+	 * referenced by the crush rule.
+	 */
+	if (force >= 0) {
+		if (force >= map->max_devices ||
+		    map->device_parents[force] == 0) {
+			/*dprintk("CRUSH: forcefed device dne\n");*/
+			rc = -1;  /* force fed device dne */
+			goto out;
+		}
+		if (!is_out(map, weight, force, x)) {
+			while (1) {
+				force_context[++force_pos] = force;
+				if (force >= 0)
+					force = map->device_parents[force];
+				else
+					force = map->bucket_parents[-1-force];
+				if (force == 0)
+					break;
+			}
+		}
+	}
+
+	for (step = 0; step < rule->len; step++) {
+		firstn = 0;
+		switch (rule->steps[step].op) {
+		case CRUSH_RULE_TAKE:
+			w[0] = rule->steps[step].arg1;
+			if (force_pos >= 0) {
+				BUG_ON(force_context[force_pos] != w[0]);
+				force_pos--;
+			}
+			wsize = 1;
+			break;
+
+		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			BUG_ON(wsize == 0);
+
+			recurse_to_leaf =
+				rule->steps[step].op ==
+				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+				rule->steps[step].op ==
+				CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				/*
+				 * see CRUSH_N, CRUSH_N_MINUS macros.
+				 * basically, numrep <= 0 means relative to
+				 * the provided result_max
+				 */
+				numrep = rule->steps[step].arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				if (osize == 0 && force_pos >= 0) {
+					/* skip any intermediate types */
+					while (force_pos &&
+					       force_context[force_pos] < 0 &&
+					       rule->steps[step].arg2 !=
+					       map->buckets[-1 -
+					       force_context[force_pos]]->type)
+						force_pos--;
+					o[osize] = force_context[force_pos];
+					if (recurse_to_leaf)
+						c[osize] = force_context[0];
+					j++;
+					force_pos--;
+				}
+				osize += crush_choose(map,
+						      map->buckets[-1-w[i]],
+						      weight,
+						      x, numrep,
+						      rule->steps[step].arg2,
+						      o+osize, j,
+						      firstn,
+						      recurse_to_leaf, c+osize);
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap t and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+	}
+	rc = result_len;
+
+out:
+	return rc;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000000..7b505b0c983f
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	if (*p + sizeof(u16) + sizeof(key->created) +
+	    sizeof(u16) + key->len > end)
+		return -ERANGE;
+	ceph_encode_16(p, key->type);
+	ceph_encode_copy(p, &key->created, sizeof(key->created));
+	ceph_encode_16(p, key->len);
+	ceph_encode_copy(p, key->key, key->len);
+	return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+	key->type = ceph_decode_16(p);
+	ceph_decode_copy(p, &key->created, sizeof(key->created));
+	key->len = ceph_decode_16(p);
+	ceph_decode_need(p, end, key->len, bad);
+	key->key = kmalloc(key->len, GFP_NOFS);
+	if (!key->key)
+		return -ENOMEM;
+	ceph_decode_copy(p, key->key, key->len);
+	return 0;
+
+bad:
+	dout("failed to decode crypto key\n");
+	return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+	int inlen = strlen(inkey);
+	int blen = inlen * 3 / 4;
+	void *buf, *p;
+	int ret;
+
+	dout("crypto_key_unarmor %s\n", inkey);
+	buf = kmalloc(blen, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	blen = ceph_unarmor(buf, inkey, inkey+inlen);
+	if (blen < 0) {
+		kfree(buf);
+		return blen;
+	}
+
+	p = buf;
+	ret = ceph_crypto_key_decode(key, &p, p + blen);
+	kfree(buf);
+	if (ret)
+		return ret;
+	dout("crypto_key_unarmor key %p type %d len %d\n", key,
+	     key->type, key->len);
+	return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+static int ceph_aes_encrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[2], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - (src_len & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 2);
+	sg_set_buf(&sg_in[0], src, src_len);
+	sg_set_buf(&sg_in[1], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+			src, src_len, 1);
+	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+			     size_t *dst_len,
+			     const void *src1, size_t src1_len,
+			     const void *src2, size_t src2_len)
+{
+	struct scatterlist sg_in[3], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src1_len + src2_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 3);
+	sg_set_buf(&sg_in[0], src1, src1_len);
+	sg_set_buf(&sg_in[1], src2, src2_len);
+	sg_set_buf(&sg_in[2], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+			src1, src1_len, 1);
+	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+			src2, src2_len, 1);
+	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src1_len + src2_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_decrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[2];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 1);
+	sg_init_table(sg_out, 2);
+	sg_set_buf(sg_in, src, src_len);
+	sg_set_buf(&sg_out[0], dst, *dst_len);
+	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst_len)
+		last_byte = ((char *)dst)[src_len - 1];
+	else
+		last_byte = pad[src_len - *dst_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		*dst_len = src_len - last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_decrypt2(const void *key, int key_len,
+			     void *dst1, size_t *dst1_len,
+			     void *dst2, size_t *dst2_len,
+			     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[3];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src, src_len);
+	sg_init_table(sg_out, 3);
+	sg_set_buf(&sg_out[0], dst1, *dst1_len);
+	sg_set_buf(&sg_out[1], dst2, *dst2_len);
+	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst1_len)
+		last_byte = ((char *)dst1)[src_len - 1];
+	else if (src_len <= *dst1_len + *dst2_len)
+		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+	else
+		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		src_len -= last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+
+	if (src_len < *dst1_len) {
+		*dst1_len = src_len;
+		*dst2_len = 0;
+	} else {
+		*dst2_len = src_len - *dst1_len;
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst1, *dst1_len, 1);
+	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst2, *dst2_len, 1);
+	*/
+
+	return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len)
+{
+	size_t t;
+
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst1_len + *dst2_len < src_len)
+			return -ERANGE;
+		t = min(*dst1_len, src_len);
+		memcpy(dst1, src, t);
+		*dst1_len = t;
+		src += t;
+		src_len -= t;
+		if (src_len) {
+			t = min(*dst2_len, src_len);
+			memcpy(dst2, src, t);
+			*dst2_len = t;
+		}
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt2(secret->key, secret->len,
+					 dst1, dst1_len, dst2, dst2_len,
+					 src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		  const void *src1, size_t src1_len,
+		  const void *src2, size_t src2_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src1_len + src2_len)
+			return -ERANGE;
+		memcpy(dst, src1, src1_len);
+		memcpy(dst + src1_len, src2, src2_len);
+		*dst_len = src1_len + src2_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+					 src1, src1_len, src2, src2_len);
+
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000000..f9eccace592b
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+	int type;
+	struct ceph_timespec created;
+	int len;
+	void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+	kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+			 void *dst, size_t *dst_len,
+			 const void *src1, size_t src1_len,
+			 const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..33d04999f4f2
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,268 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   ceph_pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, ceph_pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		head = req->r_request->front.iov_base;
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = -ENOMEM;
+	char name[80];
+
+	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+		 client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	return 0;
+
+out:
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client,
+			     int (*module_debugfs_init)(struct ceph_client *))
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000000..0e8157ee5d43
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+	int i;
+	char *s;
+	struct sockaddr_in *in4 = (void *)ss;
+	struct sockaddr_in6 *in6 = (void *)ss;
+
+	spin_lock(&addr_str_lock);
+	i = last_addr_str++;
+	if (last_addr_str == MAX_ADDR_STR)
+		last_addr_str = 0;
+	spin_unlock(&addr_str_lock);
+	s = addr_str[i];
+
+	switch (ss->ss_family) {
+	case AF_INET:
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+			 (unsigned int)ntohs(in4->sin_port));
+		break;
+
+	case AF_INET6:
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+			 (unsigned int)ntohs(in6->sin6_port));
+		break;
+
+	default:
+		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+	}
+
+	return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+	ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int ceph_msgr_init(void)
+{
+	ceph_msgr_wq = create_workqueue("ceph-msgr");
+	if (IS_ERR(ceph_msgr_wq)) {
+		int ret = PTR_ERR(ceph_msgr_wq);
+		pr_err("msgr_init failed to create workqueue: %d\n", ret);
+		ceph_msgr_wq = NULL;
+		return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+void ceph_msgr_exit(void)
+{
+	destroy_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+void ceph_msgr_flush(void)
+{
+	flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+	if (sk->sk_state != TCP_CLOSE_WAIT) {
+		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		     con, con->state);
+		queue_con(con);
+	}
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	/* only queue to workqueue if there is data we want to write. */
+	if (test_bit(WRITE_PENDING, &con->state)) {
+		dout("ceph_write_space %p queueing write work\n", con);
+		queue_con(con);
+	} else {
+		dout("ceph_write_space %p nothing to write\n", con);
+	}
+
+	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	     con, con->state, sk->sk_state);
+
+	if (test_bit(CLOSED, &con->state))
+		return;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		dout("ceph_state_change TCP_CLOSE\n");
+	case TCP_CLOSE_WAIT:
+		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+			if (test_bit(CONNECTING, &con->state))
+				con->error_msg = "connection failed";
+			else
+				con->error_msg = "socket closed";
+			queue_con(con);
+		}
+		break;
+	case TCP_ESTABLISHED:
+		dout("ceph_state_change TCP_ESTABLISHED\n");
+		queue_con(con);
+		break;
+	}
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+			       struct ceph_connection *con)
+{
+	struct sock *sk = sock->sk;
+	sk->sk_user_data = (void *)con;
+	sk->sk_data_ready = ceph_data_ready;
+	sk->sk_write_space = ceph_write_space;
+	sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+	struct socket *sock;
+	int ret;
+
+	BUG_ON(con->sock);
+	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
+	if (ret)
+		return ERR_PTR(ret);
+	con->sock = sock;
+	sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+	set_sock_callbacks(sock, con);
+
+	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+
+	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+				 O_NONBLOCK);
+	if (ret == -EINPROGRESS) {
+		dout("connect %s EINPROGRESS sk_state = %u\n",
+		     ceph_pr_addr(&con->peer_addr.in_addr),
+		     sock->sk->sk_state);
+		ret = 0;
+	}
+	if (ret < 0) {
+		pr_err("connect %s error %d\n",
+		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
+		sock_release(sock);
+		con->sock = NULL;
+		con->error_msg = "connect error";
+	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+	return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something.  @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+		     size_t kvlen, size_t len, int more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	if (more)
+		msg.msg_flags |= MSG_MORE;
+	else
+		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
+
+	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+	int rc;
+
+	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	if (!con->sock)
+		return 0;
+	set_bit(SOCK_CLOSED, &con->state);
+	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+	sock_release(con->sock);
+	con->sock = NULL;
+	clear_bit(SOCK_CLOSED, &con->state);
+	return rc;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+	list_del_init(&msg->list_head);
+	ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+							list_head);
+		ceph_msg_remove(msg);
+	}
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+	/* reset connection, out_queue, msg_ and connect_seq */
+	/* discard existing out_queue and msg_seq */
+	ceph_msg_remove_list(&con->out_queue);
+	ceph_msg_remove_list(&con->out_sent);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	con->connect_seq = 0;
+	con->out_seq = 0;
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+	con->out_keepalive_pending = false;
+	con->in_seq = 0;
+	con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+	dout("con_close %p peer %s\n", con,
+	     ceph_pr_addr(&con->peer_addr.in_addr));
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->state);
+	mutex_lock(&con->mutex);
+	reset_connection(con);
+	con->peer_global_seq = 0;
+	cancel_delayed_work(&con->work);
+	mutex_unlock(&con->mutex);
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+	set_bit(OPENING, &con->state);
+	clear_bit(CLOSED, &con->state);
+	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	con->delay = 0;      /* reset backoff memory */
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+	dout("con_get %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+	if (atomic_inc_not_zero(&con->nref))
+		return con;
+	return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+	dout("con_put %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+	BUG_ON(atomic_read(&con->nref) == 0);
+	if (atomic_dec_and_test(&con->nref)) {
+		BUG_ON(con->sock);
+		kfree(con);
+	}
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+	dout("con_init %p\n", con);
+	memset(con, 0, sizeof(*con));
+	atomic_set(&con->nref, 1);
+	con->msgr = msgr;
+	mutex_init(&con->mutex);
+	INIT_LIST_HEAD(&con->out_queue);
+	INIT_LIST_HEAD(&con->out_sent);
+	INIT_DELAYED_WORK(&con->work, con_work);
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+	u32 ret;
+
+	spin_lock(&msgr->global_seq_lock);
+	if (msgr->global_seq < gt)
+		msgr->global_seq = gt;
+	ret = ++msgr->global_seq;
+	spin_unlock(&msgr->global_seq_lock);
+	return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con->out_kvec_is_msg = true;
+	con->out_kvec[v].iov_base = &m->footer;
+	con->out_kvec[v].iov_len = sizeof(m->footer);
+	con->out_kvec_bytes += sizeof(m->footer);
+	con->out_kvec_left++;
+	con->out_more = m->more_to_follow;
+	con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	int v = 0;
+
+	con->out_kvec_bytes = 0;
+	con->out_kvec_is_msg = true;
+	con->out_msg_done = false;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con->out_kvec[v].iov_base = &tag_ack;
+		con->out_kvec[v++].iov_len = 1;
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con->out_kvec[v].iov_base = &con->out_temp_ack;
+		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	}
+
+	m = list_first_entry(&con->out_queue,
+		       struct ceph_msg, list_head);
+	con->out_msg = m;
+	if (test_bit(LOSSYTX, &con->state)) {
+		list_del_init(&m->list_head);
+	} else {
+		/* put message on sent list */
+		ceph_msg_get(m);
+		list_move_tail(&m->list_head, &con->out_sent);
+	}
+
+	/*
+	 * only assign outgoing seq # if we haven't sent this message
+	 * yet.  if it is requeued, resend with it's original seq.
+	 */
+	if (m->needs_out_seq) {
+		m->hdr.seq = cpu_to_le64(++con->out_seq);
+		m->needs_out_seq = false;
+	}
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     le32_to_cpu(m->hdr.data_len),
+	     m->nr_pages);
+	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+	/* tag + hdr + front + middle */
+	con->out_kvec[v].iov_base = &tag_msg;
+	con->out_kvec[v++].iov_len = 1;
+	con->out_kvec[v].iov_base = &m->hdr;
+	con->out_kvec[v++].iov_len = sizeof(m->hdr);
+	con->out_kvec[v++] = m->front;
+	if (m->middle)
+		con->out_kvec[v++] = m->middle->vec;
+	con->out_kvec_left = v;
+	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+		(m->middle ? m->middle->vec.iov_len : 0);
+	con->out_kvec_cur = con->out_kvec;
+
+	/* fill in crc (except data pages), footer */
+	con->out_msg->hdr.crc =
+		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.front_crc =
+		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+	if (m->middle)
+		con->out_msg->footer.middle_crc =
+			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+					   m->middle->vec.iov_len));
+	else
+		con->out_msg->footer.middle_crc = 0;
+	con->out_msg->footer.data_crc = 0;
+	dout("prepare_write_message front_crc %u data_crc %u\n",
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+
+	/* is there a data payload? */
+	if (le32_to_cpu(m->hdr.data_len) > 0) {
+		/* initialize page iterator */
+		con->out_msg_pos.page = 0;
+		if (m->pages)
+			con->out_msg_pos.page_pos =
+				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		else
+			con->out_msg_pos.page_pos = 0;
+		con->out_msg_pos.data_pos = 0;
+		con->out_msg_pos.did_page_crc = 0;
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con, v);
+	}
+
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con->out_kvec[0].iov_base = &tag_ack;
+	con->out_kvec[0].iov_len = 1;
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con->out_kvec[1].iov_base = &con->out_temp_ack;
+	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con->out_kvec[0].iov_base = &tag_keepalive;
+	con->out_kvec[0].iov_len = 1;
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = 1;
+	con->out_kvec_cur = con->out_kvec;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+	void *auth_buf;
+	int auth_len = 0;
+	int auth_protocol = 0;
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->get_authorizer)
+		con->ops->get_authorizer(con, &auth_buf, &auth_len,
+					 &auth_protocol, &con->auth_reply_buf,
+					 &con->auth_reply_buf_len,
+					 con->auth_retry);
+	mutex_lock(&con->mutex);
+
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+				 struct ceph_connection *con)
+{
+	int len = strlen(CEPH_BANNER);
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+				  struct ceph_connection *con,
+				  int after_banner)
+{
+	unsigned global_seq = get_global_seq(con->msgr, 0);
+	int proto;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+
+	con->out_connect.features = cpu_to_le64(msgr->supported_features);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+
+	if (!after_banner) {
+		con->out_kvec_left = 0;
+		con->out_kvec_bytes = 0;
+	}
+	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left++;
+	con->out_kvec_bytes += sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+
+	prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+		while (ret > 0) {
+			if (ret >= con->out_kvec_cur->iov_len) {
+				ret -= con->out_kvec_cur->iov_len;
+				con->out_kvec_cur++;
+				con->out_kvec_left--;
+			} else {
+				con->out_kvec_cur->iov_len -= ret;
+				con->out_kvec_cur->iov_base += ret;
+				ret = 0;
+				break;
+			}
+		}
+	}
+	con->out_kvec_left = 0;
+	con->out_kvec_is_msg = false;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+	size_t len;
+	int crc = con->msgr->nocrc;
+	int ret;
+	int total_max_write;
+	int in_trail = 0;
+	size_t trail_len = (msg->trail ? msg->trail->length : 0);
+
+	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con->out_msg_pos.page_pos);
+
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+	while (data_len > con->out_msg_pos.data_pos) {
+		struct page *page = NULL;
+		void *kaddr = NULL;
+		int max_write = PAGE_SIZE;
+		int page_shift = 0;
+
+		total_max_write = data_len - trail_len -
+			con->out_msg_pos.data_pos;
+
+		/*
+		 * if we are calculating the data crc (the default), we need
+		 * to map the page.  if our pages[] has been revoked, use the
+		 * zero page.
+		 */
+
+		/* have we reached the trail part of the data? */
+		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+			in_trail = 1;
+
+			total_max_write = data_len - con->out_msg_pos.data_pos;
+
+			page = list_first_entry(&msg->trail->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+			max_write = PAGE_SIZE;
+		} else if (msg->pages) {
+			page = msg->pages[con->out_msg_pos.page];
+			if (crc)
+				kaddr = kmap(page);
+		} else if (msg->pagelist) {
+			page = list_first_entry(&msg->pagelist->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+		} else if (msg->bio) {
+			struct bio_vec *bv;
+
+			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+			page = bv->bv_page;
+			page_shift = bv->bv_offset;
+			if (crc)
+				kaddr = kmap(page) + page_shift;
+			max_write = bv->bv_len;
+#endif
+		} else {
+			page = con->msgr->zero_page;
+			if (crc)
+				kaddr = page_address(con->msgr->zero_page);
+		}
+		len = min_t(int, max_write - con->out_msg_pos.page_pos,
+			    total_max_write);
+
+		if (crc && !con->out_msg_pos.did_page_crc) {
+			void *base = kaddr + con->out_msg_pos.page_pos;
+			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+			BUG_ON(kaddr == NULL);
+			con->out_msg->footer.data_crc =
+				cpu_to_le32(crc32c(tmpcrc, base, len));
+			con->out_msg_pos.did_page_crc = 1;
+		}
+		ret = kernel_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos + page_shift,
+				      len,
+				      MSG_DONTWAIT | MSG_NOSIGNAL |
+				      MSG_MORE);
+
+		if (crc &&
+		    (msg->pages || msg->pagelist || msg->bio || in_trail))
+			kunmap(page);
+
+		if (ret <= 0)
+			goto out;
+
+		con->out_msg_pos.data_pos += ret;
+		con->out_msg_pos.page_pos += ret;
+		if (ret == len) {
+			con->out_msg_pos.page_pos = 0;
+			con->out_msg_pos.page++;
+			con->out_msg_pos.did_page_crc = 0;
+			if (in_trail)
+				list_move_tail(&page->lru,
+					       &msg->trail->head);
+			else if (msg->pagelist)
+				list_move_tail(&page->lru,
+					       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+			else if (msg->bio)
+				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+		}
+	}
+
+	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+	/* prepare and queue up footer, too */
+	if (!crc)
+		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_left = 0;
+	con->out_kvec_cur = con->out_kvec;
+	prepare_write_message_footer(con, 0);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int ret;
+
+	while (con->out_skip > 0) {
+		struct kvec iov = {
+			.iov_base = page_address(con->msgr->zero_page),
+			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+		};
+
+		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL);
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+			int *to, int size, void *object)
+{
+	*to += size;
+	while (con->in_base_pos < *to) {
+		int left = *to - con->in_base_pos;
+		int have = size - left;
+		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+	/* peer's banner */
+	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+			   &con->actual_peer_addr);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+			   &con->peer_addr_for_me);
+	if (ret <= 0)
+		goto out;
+out:
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+			   con->auth_reply_buf);
+	if (ret <= 0)
+		goto out;
+
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, (int)con->in_reply.tag,
+	     le32_to_cpu(con->in_reply.connect_seq),
+	     le32_to_cpu(con->in_reply.global_seq));
+out:
+	return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+		pr_err("connect to %s got bad banner\n",
+		       ceph_pr_addr(&con->peer_addr.in_addr));
+		con->error_msg = "protocol error, bad banner";
+		return -1;
+	}
+	return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+	case AF_INET6:
+		return
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+	}
+	return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ntohs(((struct sockaddr_in *)ss)->sin_port);
+	case AF_INET6:
+		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+	}
+	return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)ss)->sin_port = htons(p);
+	case AF_INET6:
+		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+	}
+}
+
+/*
+ * Parse an ip[:port] list into an addr array.  Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+		   struct ceph_entity_addr *addr,
+		   int max_count, int *count)
+{
+	int i;
+	const char *p = c;
+
+	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+	for (i = 0; i < max_count; i++) {
+		const char *ipend;
+		struct sockaddr_storage *ss = &addr[i].in_addr;
+		struct sockaddr_in *in4 = (void *)ss;
+		struct sockaddr_in6 *in6 = (void *)ss;
+		int port;
+		char delim = ',';
+
+		if (*p == '[') {
+			delim = ']';
+			p++;
+		}
+
+		memset(ss, 0, sizeof(*ss));
+		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+			     delim, &ipend))
+			ss->ss_family = AF_INET;
+		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				  delim, &ipend))
+			ss->ss_family = AF_INET6;
+		else
+			goto bad;
+		p = ipend;
+
+		if (delim == ']') {
+			if (*p != ']') {
+				dout("missing matching ']'\n");
+				goto bad;
+			}
+			p++;
+		}
+
+		/* port? */
+		if (p < end && *p == ':') {
+			port = 0;
+			p++;
+			while (p < end && *p >= '0' && *p <= '9') {
+				port = (port * 10) + (*p - '0');
+				p++;
+			}
+			if (port > 65535 || port == 0)
+				goto bad;
+		} else {
+			port = CEPH_MON_PORT;
+		}
+
+		addr_set_port(ss, port);
+
+		dout("parse_ips got %s\n", ceph_pr_addr(ss));
+
+		if (p == end)
+			break;
+		if (*p != ',')
+			goto bad;
+		p++;
+	}
+
+	if (p != end)
+		goto bad;
+
+	if (count)
+		*count = i + 1;
+	return 0;
+
+bad:
+	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+static int process_banner(struct ceph_connection *con)
+{
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_addr(&con->peer_addr_for_me);
+
+	/*
+	 * Make sure the other end is who we wanted.  note that the other
+	 * end may not yet know their ip address, so if it's 0.0.0.0, give
+	 * them the benefit of the doubt.
+	 */
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
+	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_warning("wrong peer, want %s/%d, got %s/%d\n",
+			   ceph_pr_addr(&con->peer_addr.in_addr),
+			   (int)le32_to_cpu(con->peer_addr.nonce),
+			   ceph_pr_addr(&con->actual_peer_addr.in_addr),
+			   (int)le32_to_cpu(con->actual_peer_addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/*
+	 * did we learn our address?
+	 */
+	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+		int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+		memcpy(&con->msgr->inst.addr.in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		encode_my_addr(con->msgr);
+		dout("process_banner learned my addr is %s\n",
+		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+	}
+
+	set_bit(NEGOTIATING, &con->state);
+	prepare_read_connect(con);
+	return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+	reset_connection(con);
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->bad_proto)
+		con->ops->bad_proto(con);
+	mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = con->msgr->supported_features;
+	u64 req_feat = con->msgr->required_features;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			reset_connection(con);
+			set_bit(CLOSED, &con->state);
+			return -1;
+		}
+		con->auth_retry = 1;
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_connect.connect_seq));
+		pr_err("%s%lld %s connection reset\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr));
+		reset_connection(con);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+
+		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
+		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_connect.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_connect.global_seq));
+		get_global_seq(con->msgr,
+			       le32_to_cpu(con->in_connect.global_seq));
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       ceph_pr_addr(&con->peer_addr.in_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			fail_protocol(con);
+			return -1;
+		}
+		clear_bit(CONNECTING, &con->state);
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYTX, &con->state);
+
+		prepare_read_tag(con);
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		pr_err("process_connect peer connecting WAIT\n");
+
+	default:
+		pr_err("connect protocol error, will retry\n");
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int to = 0;
+
+	return read_partial(con, &to, sizeof(con->in_temp_ack),
+			    &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	u64 ack = le64_to_cpu(con->in_temp_ack);
+	u64 seq;
+
+	while (!list_empty(&con->out_sent)) {
+		m = list_first_entry(&con->out_sent, struct ceph_msg,
+				     list_head);
+		seq = le64_to_cpu(m->hdr.seq);
+		if (seq > ack)
+			break;
+		dout("got ack for seq %llu type %d at %p\n", seq,
+		     le16_to_cpu(m->hdr.type), m);
+		ceph_msg_remove(m);
+	}
+	prepare_read_tag(con);
+}
+
+
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section,
+					unsigned int sec_len, u32 *crc)
+{
+	int ret, left;
+
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		BUG_ON(section->iov_base == NULL);
+		left = sec_len - section->iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+				       section->iov_len, left);
+		if (ret <= 0)
+			return ret;
+		section->iov_len += ret;
+		if (section->iov_len == sec_len)
+			*crc = crc32c(0, section->iov_base,
+				      section->iov_len);
+	}
+
+	return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+				      struct page **pages,
+				      unsigned data_len, int datacrc)
+{
+	void *p;
+	int ret;
+	int left;
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+	/* (page) data */
+	BUG_ON(pages == NULL);
+	p = kmap(pages[con->in_msg_pos.page]);
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(pages[con->in_msg_pos.page]);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+		con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.page++;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+				    struct bio **bio_iter, int *bio_seg,
+				    unsigned data_len, int datacrc)
+{
+	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+	void *p;
+	int ret, left;
+
+	if (IS_ERR(bv))
+		return PTR_ERR(bv);
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+	p = kmap(bv->bv_page) + bv->bv_offset;
+
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(bv->bv_page);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == bv->bv_len) {
+		con->in_msg_pos.page_pos = 0;
+		iter_bio_next(bio_iter, bio_seg);
+	}
+
+	return ret;
+}
+#endif
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	int ret;
+	int to, left;
+	unsigned front_len, middle_len, data_len, data_off;
+	int datacrc = con->msgr->nocrc;
+	int skip;
+	u64 seq;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	while (con->in_base_pos < sizeof(con->in_hdr)) {
+		left = sizeof(con->in_hdr) - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)&con->in_hdr + con->in_base_pos,
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+		if (con->in_base_pos == sizeof(con->in_hdr)) {
+			u32 crc = crc32c(0, (void *)&con->in_hdr,
+				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+			if (crc != le32_to_cpu(con->in_hdr.crc)) {
+				pr_err("read_partial_message bad hdr "
+				       " crc %u != expected %u\n",
+				       crc, con->in_hdr.crc);
+				return -EBADMSG;
+			}
+		}
+	}
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_off = le16_to_cpu(con->in_hdr.data_off);
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr.in_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof(m->footer);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+		return 0;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADMSG;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     con->in_hdr.front_len, con->in_hdr.data_len);
+		skip = 0;
+		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			BUG_ON(con->in_msg);
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof(m->footer);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 0;
+		}
+		if (!con->in_msg) {
+			con->error_msg =
+				"error allocating memory for incoming message";
+			return -ENOMEM;
+		}
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		con->in_msg_pos.page = 0;
+		if (m->pages)
+			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		else
+			con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.data_pos = 0;
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec,
+						   middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+#ifdef CONFIG_BLOCK
+	if (m->bio && !m->bio_iter)
+		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
+
+	/* (page) data */
+	while (con->in_msg_pos.data_pos < data_len) {
+		if (m->pages) {
+			ret = read_partial_message_pages(con, m->pages,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#ifdef CONFIG_BLOCK
+		} else if (m->bio) {
+
+			ret = read_partial_message_bio(con,
+						 &m->bio_iter, &m->bio_seg,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#endif
+		} else {
+			BUG_ON(1);
+		}
+	}
+
+	/* footer */
+	to = sizeof(m->hdr) + sizeof(m->footer);
+	while (con->in_base_pos < to) {
+		left = to - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+				       (con->in_base_pos - sizeof(m->hdr)),
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok? */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, m->footer.front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, m->footer.middle_crc);
+		return -EBADMSG;
+	}
+	if (datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+/*
+ * Process message.  This happens in the worker thread.  The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+
+	msg = con->in_msg;
+	con->in_msg = NULL;
+
+	/* if first message, set peer_name */
+	if (con->peer_name.type == 0)
+		con->peer_name = msg->hdr.src;
+
+	con->in_seq++;
+	mutex_unlock(&con->mutex);
+
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	     msg, le64_to_cpu(msg->hdr.seq),
+	     ENTITY_NAME(msg->hdr.src),
+	     le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.data_len),
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+	con->ops->dispatch(con, msg);
+
+	mutex_lock(&con->mutex);
+	prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr = con->msgr;
+	int ret = 1;
+
+	dout("try_write start %p state %lu nref %d\n", con, con->state,
+	     atomic_read(&con->nref));
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+	/* open the socket first? */
+	if (con->sock == NULL) {
+		/*
+		 * if we were STANDBY and are reconnecting _this_
+		 * connection, bump connect_seq now.  Always bump
+		 * global_seq.
+		 */
+		if (test_and_clear_bit(STANDBY, &con->state))
+			con->connect_seq++;
+
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con, 1);
+		prepare_read_banner(con);
+		set_bit(CONNECTING, &con->state);
+		clear_bit(NEGOTIATING, &con->state);
+
+		BUG_ON(con->in_msg);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %lu\n",
+		     con, con->state);
+		con->sock = ceph_tcp_connect(con);
+		if (IS_ERR(con->sock)) {
+			con->sock = NULL;
+			con->error_msg = "connect error";
+			ret = -1;
+			goto out;
+		}
+	}
+
+more_kvec:
+	/* kvec data queued? */
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_skip err %d\n", ret);
+			goto done;
+		}
+	}
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto done;
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_msg_pages(con);
+		if (ret == 1)
+			goto more_kvec;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_msg_pages err %d\n",
+			     ret);
+			goto done;
+		}
+	}
+
+do_next:
+	if (!test_bit(CONNECTING, &con->state)) {
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	clear_bit(WRITE_PENDING, &con->state);
+	dout("try_write nothing else to write.\n");
+done:
+	ret = 0;
+out:
+	dout("try_write done on %p\n", con);
+	return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+	if (!con->sock)
+		return 0;
+
+	if (test_bit(STANDBY, &con->state))
+		return 0;
+
+	dout("try_read start on %p\n", con);
+
+more:
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+	if (test_bit(CONNECTING, &con->state)) {
+		if (!test_bit(NEGOTIATING, &con->state)) {
+			dout("try_read connecting\n");
+			ret = read_partial_banner(con);
+			if (ret <= 0)
+				goto done;
+			if (process_banner(con) < 0) {
+				ret = -1;
+				goto out;
+			}
+		}
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto done;
+		if (process_connect(con) < 0) {
+			ret = -1;
+			goto out;
+		}
+		goto more;
+	}
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 *
+		 * FIXME: there must be a better way to do this!
+		 */
+		static char buf[1024];
+		int skip = min(1024, -con->in_base_pos);
+		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+		if (ret <= 0)
+			goto done;
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto done;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			set_bit(CLOSED, &con->state);   /* fixme */
+			goto done;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc";
+				ret = -EIO;
+				goto out;
+			case -EIO:
+				con->error_msg = "io error";
+				goto out;
+			default:
+				goto done;
+			}
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		process_message(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto done;
+		process_ack(con);
+		goto more;
+	}
+
+done:
+	ret = 0;
+out:
+	dout("try_read done on %p\n", con);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection.  Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY.  It
+ * clears QUEUED and does it's thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work.  If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	if (test_bit(DEAD, &con->state)) {
+		dout("queue_con %p ignoring: DEAD\n",
+		     con);
+		return;
+	}
+
+	if (!con->ops->get(con)) {
+		dout("queue_con %p ref count 0\n", con);
+		return;
+	}
+
+	set_bit(QUEUED, &con->state);
+	if (test_bit(BUSY, &con->state)) {
+		dout("queue_con %p - already BUSY\n", con);
+		con->ops->put(con);
+	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+		dout("queue_con %p - already queued\n", con);
+		con->ops->put(con);
+	} else {
+		dout("queue_con %p\n", con);
+	}
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	int backoff = 0;
+
+more:
+	if (test_and_set_bit(BUSY, &con->state) != 0) {
+		dout("con_work %p BUSY already set\n", con);
+		goto out;
+	}
+	dout("con_work %p start, clearing QUEUED\n", con);
+	clear_bit(QUEUED, &con->state);
+
+	mutex_lock(&con->mutex);
+
+	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+		dout("con_work CLOSED\n");
+		con_close_socket(con);
+		goto done;
+	}
+	if (test_and_clear_bit(OPENING, &con->state)) {
+		/* reopen w/ new peer */
+		dout("con_work OPENING\n");
+		con_close_socket(con);
+	}
+
+	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+	    try_read(con) < 0 ||
+	    try_write(con) < 0) {
+		mutex_unlock(&con->mutex);
+		backoff = 1;
+		ceph_fault(con);     /* error/fault path */
+		goto done_unlocked;
+	}
+
+done:
+	mutex_unlock(&con->mutex);
+
+done_unlocked:
+	clear_bit(BUSY, &con->state);
+	dout("con->state=%lu\n", con->state);
+	if (test_bit(QUEUED, &con->state)) {
+		if (!backoff || test_bit(OPENING, &con->state)) {
+			dout("con_work %p QUEUED reset, looping\n", con);
+			goto more;
+		}
+		dout("con_work %p QUEUED reset, but just faulted\n", con);
+		clear_bit(QUEUED, &con->state);
+	}
+	dout("con_work %p done\n", con);
+
+out:
+	con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+	if (test_bit(LOSSYTX, &con->state)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out;
+	}
+
+	mutex_lock(&con->mutex);
+	if (test_bit(CLOSED, &con->state))
+		goto out_unlock;
+
+	con_close_socket(con);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
+
+	/* If there are no messages in the queue, place the connection
+	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
+	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+		dout("fault setting STANDBY\n");
+		set_bit(STANDBY, &con->state);
+	} else {
+		/* retry after a delay. */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		dout("fault queueing %p delay %lu\n", con, con->delay);
+		con->ops->get(con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay)) == 0)
+			con->ops->put(con);
+	}
+
+out_unlock:
+	mutex_unlock(&con->mutex);
+out:
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+	 */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+					     u32 supported_features,
+					     u32 required_features)
+{
+	struct ceph_messenger *msgr;
+
+	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+	if (msgr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msgr->supported_features = supported_features;
+	msgr->required_features = required_features;
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	/* the zero page is needed if a request is "canceled" while the message
+	 * is being written over the socket */
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+	if (!msgr->zero_page) {
+		kfree(msgr);
+		return ERR_PTR(-ENOMEM);
+	}
+	kmap(msgr->zero_page);
+
+	if (myaddr)
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	msgr->inst.addr.type = 0;
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
+
+	dout("messenger_create %p\n", msgr);
+	return msgr;
+}
+EXPORT_SYMBOL(ceph_messenger_create);
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+	dout("destroy %p\n", msgr);
+	kunmap(msgr->zero_page);
+	__free_page(msgr->zero_page);
+	kfree(msgr);
+	dout("destroyed messenger %p\n", msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_destroy);
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		return;
+	}
+
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+	msg->needs_out_seq = true;
+
+	/* queue */
+	mutex_lock(&con->mutex);
+	BUG_ON(!list_empty(&msg->list_head));
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+	mutex_unlock(&con->mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("con_revoke %p msg %p - was on queue\n", con, msg);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	if (con->out_msg == msg) {
+		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		con->out_msg = NULL;
+		if (con->out_kvec_is_msg) {
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (con->in_msg && con->in_msg == msg) {
+		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+		/* skip rest of message */
+		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+			con->in_base_pos = con->in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
+				sizeof(struct ceph_msg_footer);
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+	} else {
+		dout("con_revoke_pages %p msg %p pages %p no-op\n",
+		     con, con->in_msg, msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+	struct ceph_msg *m;
+
+	m = kmalloc(sizeof(*m), flags);
+	if (m == NULL)
+		goto out;
+	kref_init(&m->kref);
+	INIT_LIST_HEAD(&m->list_head);
+
+	m->hdr.tid = 0;
+	m->hdr.type = cpu_to_le16(type);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.version = 0;
+	m->hdr.front_len = cpu_to_le32(front_len);
+	m->hdr.middle_len = 0;
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
+	m->hdr.reserved = 0;
+	m->footer.front_crc = 0;
+	m->footer.middle_crc = 0;
+	m->footer.data_crc = 0;
+	m->footer.flags = 0;
+	m->front_max = front_len;
+	m->front_is_vmalloc = false;
+	m->more_to_follow = false;
+	m->pool = NULL;
+
+	/* front */
+	if (front_len) {
+		if (front_len > PAGE_CACHE_SIZE) {
+			m->front.iov_base = __vmalloc(front_len, flags,
+						      PAGE_KERNEL);
+			m->front_is_vmalloc = true;
+		} else {
+			m->front.iov_base = kmalloc(front_len, flags);
+		}
+		if (m->front.iov_base == NULL) {
+			pr_err("msg_new can't allocate %d bytes\n",
+			     front_len);
+			goto out2;
+		}
+	} else {
+		m->front.iov_base = NULL;
+	}
+	m->front.iov_len = front_len;
+
+	/* middle */
+	m->middle = NULL;
+
+	/* data */
+	m->nr_pages = 0;
+	m->pages = NULL;
+	m->pagelist = NULL;
+	m->bio = NULL;
+	m->bio_iter = NULL;
+	m->bio_seg = 0;
+	m->trail = NULL;
+
+	dout("ceph_msg_new %p front %d\n", m, front_len);
+	return m;
+
+out2:
+	ceph_msg_put(m);
+out:
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
+	return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg.  This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int type = le16_to_cpu(msg->hdr.type);
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+	if (!msg->middle)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg = NULL;
+	int ret;
+
+	if (con->ops->alloc_msg) {
+		mutex_unlock(&con->mutex);
+		msg = con->ops->alloc_msg(con, hdr, skip);
+		mutex_lock(&con->mutex);
+		if (!msg || *skip)
+			return NULL;
+	}
+	if (!msg) {
+		*skip = 0;
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
+		if (!msg) {
+			pr_err("unable to allocate msg type %d len %d\n",
+			       type, front_len);
+			return NULL;
+		}
+	}
+	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+	if (middle_len && !msg->middle) {
+		ret = ceph_alloc_middle(con, msg);
+		if (ret < 0) {
+			ceph_msg_put(msg);
+			return NULL;
+		}
+	}
+
+	return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+	dout("msg_kfree %p\n", m);
+	if (m->front_is_vmalloc)
+		vfree(m->front.iov_base);
+	else
+		kfree(m->front.iov_base);
+	kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+	dout("ceph_msg_put last one on %p\n", m);
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
+	}
+	m->nr_pages = 0;
+	m->pages = NULL;
+
+	if (m->pagelist) {
+		ceph_pagelist_release(m->pagelist);
+		kfree(m->pagelist);
+		m->pagelist = NULL;
+	}
+
+	m->trail = NULL;
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_kfree(m);
+}
+EXPORT_SYMBOL(ceph_msg_last_put);
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+		 msg->front_max, msg->nr_pages);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000000..8a079399174a
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+	struct ceph_monmap *m = NULL;
+	int i, err = -EINVAL;
+	struct ceph_fsid fsid;
+	u32 epoch, num_mon;
+	u16 version;
+	u32 len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
+
+	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+	ceph_decode_16_safe(&p, end, version, bad);
+
+	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(&p);
+
+	num_mon = ceph_decode_32(&p);
+	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+	if (num_mon >= CEPH_MAX_MON)
+		goto bad;
+	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+	m->fsid = fsid;
+	m->epoch = epoch;
+	m->num_mon = num_mon;
+	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+	for (i = 0; i < num_mon; i++)
+		ceph_decode_addr(&m->mon_inst[i].addr);
+
+	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+	     m->num_mon);
+	for (i = 0; i < m->num_mon; i++)
+		dout("monmap_decode  mon%d is %s\n", i,
+		     ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+	return m;
+
+bad:
+	dout("monmap_decode failed with %d\n", err);
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+	int i;
+
+	for (i = 0; i < m->num_mon; i++)
+		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+			return 1;
+	return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	monc->pending_auth = 1;
+	monc->m_auth->front.iov_len = len;
+	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_msg_get(monc->m_auth);  /* keep our ref */
+	ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+	if (monc->con) {
+		dout("__close_session closing mon%d\n", monc->cur_mon);
+		ceph_con_revoke(monc->con, monc->m_auth);
+		ceph_con_close(monc->con);
+		monc->cur_mon = -1;
+		monc->pending_auth = 0;
+		ceph_auth_reset(monc->auth);
+	}
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	char r;
+	int ret;
+
+	if (monc->cur_mon < 0) {
+		get_random_bytes(&r, 1);
+		monc->cur_mon = r % monc->monmap->num_mon;
+		dout("open_session num=%d r=%d -> mon%d\n",
+		     monc->monmap->num_mon, r, monc->cur_mon);
+		monc->sub_sent = 0;
+		monc->sub_renew_after = jiffies;  /* i.e., expired */
+		monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
+		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(monc->con,
+			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiatiate authentication handshake */
+		ret = ceph_auth_build_hello(monc->auth,
+					    monc->m_auth->front.iov_base,
+					    monc->m_auth->front_max);
+		__send_prepared_auth_request(monc, ret);
+	} else {
+		dout("open_session mon%d already open\n", monc->cur_mon);
+	}
+	return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+	return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+	unsigned delay;
+
+	if (monc->cur_mon < 0 || __sub_expired(monc))
+		delay = 10 * HZ;
+	else
+		delay = 20 * HZ;
+	dout("__schedule_delayed after %u\n", delay);
+	schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     monc->want_next_osdmap);
+	if ((__sub_expired(monc) && !monc->sub_sent) ||
+	    monc->want_next_osdmap == 1) {
+		struct ceph_msg *msg = monc->m_subscribe;
+		struct ceph_mon_subscribe_item *i;
+		void *p, *end;
+		int num;
+
+		p = msg->front.iov_base;
+		end = p + msg->front_max;
+
+		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+		ceph_encode_32(&p, num);
+
+		if (monc->want_next_osdmap) {
+			dout("__send_subscribe to 'osdmap' %u\n",
+			     (unsigned)monc->have_osdmap);
+			ceph_encode_string(&p, end, "osdmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_osdmap);
+			i->onetime = 1;
+			p += sizeof(*i);
+			monc->want_next_osdmap = 2;  /* requested */
+		}
+		if (monc->want_mdsmap) {
+			dout("__send_subscribe to 'mdsmap' %u+\n",
+			     (unsigned)monc->have_mdsmap);
+			ceph_encode_string(&p, end, "mdsmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_mdsmap);
+			i->onetime = 0;
+			p += sizeof(*i);
+		}
+		ceph_encode_string(&p, end, "monmap", 6);
+		i = p;
+		i->have = 0;
+		i->onetime = 0;
+		p += sizeof(*i);
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
+
+		monc->sub_sent = jiffies | 1;  /* never 0 */
+	}
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	unsigned seconds;
+	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	seconds = le32_to_cpu(h->duration);
+
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		pr_info("mon%d %s session established\n",
+			monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+		monc->hunting = false;
+	}
+	dout("handle_subscribe_ack after %d seconds\n", seconds);
+	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+	monc->sub_sent = 0;
+	mutex_unlock(&monc->mutex);
+	return;
+bad:
+	pr_err("got corrupt subscribe-ack msg\n");
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_mdsmap = got;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_osdmap = got;
+	monc->want_next_osdmap = 0;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+	dout("request_next_osdmap have %u\n", monc->have_osdmap);
+	mutex_lock(&monc->mutex);
+	if (!monc->want_next_osdmap)
+		monc->want_next_osdmap = 1;
+	if (monc->want_next_osdmap < 2)
+		__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+	if (!monc->con) {
+		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+		if (!monc->con)
+			return -ENOMEM;
+		ceph_con_init(monc->client->msgr, monc->con);
+		monc->con->private = monc;
+		monc->con->ops = &mon_con_ops;
+	}
+
+	mutex_lock(&monc->mutex);
+	__open_session(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+
+	mutex_lock(&monc->mutex);
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		goto out;
+	}
+
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+		kfree(monmap);
+		goto out;
+	}
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+out:
+	mutex_unlock(&monc->mutex);
+	wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (e.g., statfs, poolop)
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
+		if (tid < req->tid)
+			n = n->rb_left;
+		else if (tid > req->tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
+{
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mon_generic_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
+		if (new->tid < req->tid)
+			p = &(*p)->rb_left;
+		else if (new->tid > req->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_generic_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (!req) {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
+static int do_generic_request(struct ceph_mon_client *monc,
+			      struct ceph_mon_generic_request *req)
+{
+	int err;
+
+	/* register request */
+	mutex_lock(&monc->mutex);
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	mutex_unlock(&monc->mutex);
+
+	err = wait_for_completion_interruptible(&req->completion);
+
+	mutex_lock(&monc->mutex);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
+	mutex_unlock(&monc->mutex);
+
+	if (!err)
+		err = req->result;
+	return err;
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		*(struct ceph_statfs *)req->buf = reply->st;
+		req->result = 0;
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete_all(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = sizeof(*buf);
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+
+	err = do_generic_request(monc, req);
+
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+/*
+ * pool ops
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+				char *dst, size_t dst_len)
+{
+	u32 buf_len;
+
+	if (src_len != sizeof(u32) + dst_len)
+		return -EINVAL;
+
+	buf_len = le32_to_cpu(*(u32 *)src);
+	if (buf_len != dst_len)
+		return -EINVAL;
+
+	memcpy(dst, src + sizeof(u32), dst_len);
+	return 0;
+}
+
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len < sizeof(*reply))
+		goto bad;
+	dout("handle_poolop_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		if (req->buf_len &&
+		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+				     msg->front.iov_len - sizeof(*reply),
+				     req->buf, req->buf_len) < 0) {
+			mutex_unlock(&monc->mutex);
+			goto bad;
+		}
+		req->result = le32_to_cpu(reply->reply_code);
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous pool op.
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+			u32 pool, u64 snapid,
+			char *buf, int len)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = len;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	req->request->hdr.version = cpu_to_le16(2);
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+	h->pool = cpu_to_le32(pool);
+	h->op = cpu_to_le32(op);
+	h->auid = 0;
+	h->snapid = cpu_to_le64(snapid);
+	h->name_len = 0;
+
+	err = do_generic_request(monc, req);
+
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 *snapid)
+{
+	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, 0, (char *)snapid, sizeof(*snapid));
+
+}
+EXPORT_SYMBOL(ceph_monc_create_snapid);
+
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 snapid)
+{
+	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, snapid, 0, 0);
+
+}
+
+/*
+ * Resend pending generic requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *p;
+
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_con_revoke(monc->con, req->request);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
+	}
+}
+
+/*
+ * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM).  And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
+	} else {
+		ceph_con_keepalive(monc->con);
+
+		__validate_auth(monc);
+
+		if (monc->auth->ops->is_authenticated(monc->auth))
+			__send_subscribe(monc);
+	}
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_options *opt = monc->client->options;
+	struct ceph_entity_addr *mon_addr = opt->mon_addr;
+	int num_mon = opt->num_mon;
+	int i;
+
+	/* build initial monmap */
+	monc->monmap = kzalloc(sizeof(*monc->monmap) +
+			       num_mon*sizeof(monc->monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!monc->monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		monc->monmap->mon_inst[i].addr = mon_addr[i];
+		monc->monmap->mon_inst[i].addr.nonce = 0;
+		monc->monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	monc->monmap->num_mon = num_mon;
+	monc->have_fsid = false;
+	return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
+	monc->con = NULL;
+
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->options->name,
+				    cl->options->secret);
+	if (IS_ERR(monc->auth))
+		return PTR_ERR(monc->auth);
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+	/* msgs */
+	err = -ENOMEM;
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
+	if (!monc->m_subscribe_ack)
+		goto out_monmap;
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+	monc->pending_auth = 0;
+	if (!monc->m_auth)
+		goto out_auth_reply;
+
+	monc->cur_mon = -1;
+	monc->hunting = true;
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	return 0;
+
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
+out_monmap:
+	kfree(monc->monmap);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+	if (monc->con) {
+		monc->con->private = NULL;
+		monc->con->ops->put(monc->con);
+		monc->con = NULL;
+	}
+	mutex_unlock(&monc->mutex);
+
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
+
+	kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+	int was_auth = 0;
+
+	mutex_lock(&monc->mutex);
+	if (monc->auth->ops)
+		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	monc->pending_auth = 0;
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_max);
+	if (ret < 0) {
+		monc->client->auth_err = ret;
+		wake_up_all(&monc->client->auth_wq);
+	} else if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+
+		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
+
+		__send_subscribe(monc);
+		__resend_generic_request(monc);
+	}
+	mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_max);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_POOLOP_REPLY:
+		handle_poolop_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		/* can the chained handler handle it? */
+		if (monc->client->extra_mon_dispatch &&
+		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+			break;
+			
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr,
+				      int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m = NULL;
+
+	*skip = 0;
+
+	switch (type) {
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		m = ceph_msg_get(monc->m_subscribe_ack);
+		break;
+	case CEPH_MSG_POOLOP_REPLY:
+	case CEPH_MSG_STATFS_REPLY:
+		return get_generic_reply(con, hdr, skip);
+	case CEPH_MSG_AUTH_REPLY:
+		m = ceph_msg_get(monc->m_auth_reply);
+		break;
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
+		break;
+	}
+
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
+		*skip = 1;
+	}
+	return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	if (!con->private)
+		goto out;
+
+	if (monc->con && !monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		__open_session(monc);
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+	.get = ceph_con_get,
+	.put = ceph_con_put,
+	.dispatch = dispatch,
+	.fault = mon_fault,
+	.alloc_msg = mon_alloc_msg,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000000..d5f2d97ac05c
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
+
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	pool->front_len = front_len;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	pool->name = name;
+	return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		return ceph_msg_new(0, front_len, GFP_NOFS);
+	}
+
+	return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000000..79391994b3ed
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN	4096
+#define OSD_OPREPLY_FRONT_LEN	512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+static int op_needs_trail(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_CALL:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int op_has_extent(int op)
+{
+	return (op == CEPH_OSD_OP_READ ||
+		op == CEPH_OSD_OP_WRITE);
+}
+
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+
+	reqhead->snapid = cpu_to_le64(snapid);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, bno,
+				      &objoff, &objlen);
+	if (*plen < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+
+	if (op_has_extent(op->op)) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly.  shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+			struct ceph_vino vino,
+			struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	u64 bno;
+
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+			     plen, &bno, req, op);
+
+	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+	req->r_oid_len = strlen(req->r_oid);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+	struct ceph_osd_request *req = container_of(kref,
+						    struct ceph_osd_request,
+						    r_kref);
+
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	if (req->r_con_filling_msg) {
+		dout("release_request revoking pages %p from con %p\n",
+		     req->r_pages, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg,
+				      req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+	}
+	if (req->r_own_pages)
+		ceph_release_page_vector(req->r_pages,
+					 req->r_num_pages);
+#ifdef CONFIG_BLOCK
+	if (req->r_bio)
+		bio_put(req->r_bio);
+#endif
+	ceph_put_snap_context(req->r_snapc);
+	if (req->r_trail) {
+		ceph_pagelist_release(req->r_trail);
+		kfree(req->r_trail);
+	}
+	if (req->r_mempool)
+		mempool_free(req, req->r_osdc->req_mempool);
+	else
+		kfree(req);
+}
+EXPORT_SYMBOL(ceph_osdc_release_request);
+
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+	int i = 0;
+
+	if (needs_trail)
+		*needs_trail = 0;
+	while (ops[i].op) {
+		if (needs_trail && op_needs_trail(ops[i].op))
+			*needs_trail = 1;
+		i++;
+	}
+
+	return i;
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio)
+{
+	struct ceph_osd_request *req;
+	struct ceph_msg *msg;
+	int needs_trail;
+	int num_op = get_num_ops(ops, &needs_trail);
+	size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+	msg_size += num_op*sizeof(struct ceph_osd_op);
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
+		memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), gfp_flags);
+	}
+	if (req == NULL)
+		return NULL;
+
+	req->r_osdc = osdc;
+	req->r_mempool = use_mempool;
+
+	kref_init(&req->r_kref);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+	req->r_flags = flags;
+
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+	/* create reply message */
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+	req->r_reply = msg;
+
+	/* allocate space for the trailing data */
+	if (needs_trail) {
+		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+		if (!req->r_trail) {
+			ceph_osdc_put_request(req);
+			return NULL;
+		}
+		ceph_pagelist_init(req->r_trail);
+	}
+	/* create request message; allow space for oid */
+	msg_size += 40;
+	if (snapc)
+		msg_size += sizeof(u64) * snapc->num_snaps;
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+
+	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+	memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+	req->r_request = msg;
+	req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+	if (bio) {
+		req->r_bio = bio;
+		bio_get(req->r_bio);
+	}
+#endif
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static void osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst,
+			      struct ceph_osd_req_op *src)
+{
+	dst->op = cpu_to_le16(src->op);
+
+	switch (dst->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		dst->extent.offset =
+			cpu_to_le64(src->extent.offset);
+		dst->extent.length =
+			cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		break;
+
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		BUG_ON(!req->r_trail);
+
+		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+		dst->xattr.cmp_op = src->xattr.cmp_op;
+		dst->xattr.cmp_mode = src->xattr.cmp_mode;
+		ceph_pagelist_append(req->r_trail, src->xattr.name,
+				     src->xattr.name_len);
+		ceph_pagelist_append(req->r_trail, src->xattr.val,
+				     src->xattr.value_len);
+		break;
+	case CEPH_OSD_OP_CALL:
+		BUG_ON(!req->r_trail);
+
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+				     src->cls.class_len);
+		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+				     src->cls.method_len);
+		ceph_pagelist_append(req->r_trail, src->cls.indata,
+				     src->cls.indata_len);
+		break;
+	case CEPH_OSD_OP_ROLLBACK:
+		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	default:
+		pr_err("unrecognized osd opcode %d\n", dst->op);
+		WARN_ON(1);
+		break;
+	}
+	dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+			     u64 off, u64 *plen,
+			     struct ceph_osd_req_op *src_ops,
+			     struct ceph_snap_context *snapc,
+			     struct timespec *mtime,
+			     const char *oid,
+			     int oid_len)
+{
+	struct ceph_msg *msg = req->r_request;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_req_op *src_op;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = get_num_ops(src_ops, NULL);
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int flags = req->r_flags;
+	u64 data_len = 0;
+	int i;
+
+	head = msg->front.iov_base;
+	op = (void *)(head + 1);
+	p = (void *)(op + num_op);
+
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	head->client_inc = cpu_to_le32(1); /* always, for now. */
+	head->flags = cpu_to_le32(flags);
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(&head->mtime, mtime);
+	head->num_ops = cpu_to_le16(num_op);
+
+
+	/* fill in oid */
+	head->object_len = cpu_to_le32(oid_len);
+	memcpy(p, oid, oid_len);
+	p += oid_len;
+
+	src_op = src_ops;
+	while (src_op->op) {
+		osd_req_encode_op(req, op, src_op);
+		src_op++;
+		op++;
+	}
+
+	if (req->r_trail)
+		data_len += req->r_trail->length;
+
+	if (snapc) {
+		head->snap_seq = cpu_to_le64(snapc->seq);
+		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+		for (i = 0; i < snapc->num_snaps; i++) {
+			put_unaligned_le64(snapc->snaps[i], p);
+			p += sizeof(u64);
+		}
+	}
+
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+	} else if (data_len) {
+		req->r_request->hdr.data_off = 0;
+		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+	}
+
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_request *req;
+
+	ops[0].op = opcode;
+	ops[0].extent.truncate_seq = truncate_seq;
+	ops[0].extent.truncate_size = truncate_size;
+	ops[0].payload_len = 0;
+
+	if (do_sync) {
+		ops[1].op = CEPH_OSD_OP_STARTSYNC;
+		ops[1].payload_len = 0;
+		ops[2].op = 0;
+	} else
+		ops[1].op = 0;
+
+	req = ceph_osdc_alloc_request(osdc, flags,
+					 snapc, ops,
+					 use_mempool,
+					 GFP_NOFS, NULL, NULL);
+	if (IS_ERR(req))
+		return req;
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	ceph_osdc_build_request(req, off, plen, ops,
+				snapc,
+				mtime,
+				req->r_oid, req->r_oid_len);
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *new)
+{
+	struct rb_node **p = &osdc->requests.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_osd_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+						 u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+		    u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid) {
+			if (!n->rb_left)
+				return req;
+			n = n->rb_left;
+		} else if (tid > req->r_tid) {
+			n = n->rb_right;
+		} else {
+			return req;
+		}
+	}
+	return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+
+	if (!osd)
+		return;
+	dout("osd_reset osd%d\n", osd->o_osd);
+	osdc = osd->o_osdc;
+	down_read(&osdc->map_sem);
+	kick_requests(osdc, osd);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd *osd;
+
+	osd = kzalloc(sizeof(*osd), GFP_NOFS);
+	if (!osd)
+		return NULL;
+
+	atomic_set(&osd->o_ref, 1);
+	osd->o_osdc = osdc;
+	INIT_LIST_HEAD(&osd->o_requests);
+	INIT_LIST_HEAD(&osd->o_osd_lru);
+	osd->o_incarnation = 1;
+
+	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	osd->o_con.private = osd;
+	osd->o_con.ops = &osd_con_ops;
+	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
+	return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+	if (atomic_inc_not_zero(&osd->o_ref)) {
+		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+		     atomic_read(&osd->o_ref));
+		return osd;
+	} else {
+		dout("get_osd %p FAIL\n", osd);
+		return NULL;
+	}
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+	     atomic_read(&osd->o_ref) - 1);
+	if (atomic_dec_and_test(&osd->o_ref)) {
+		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+		if (osd->o_authorizer)
+			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+		kfree(osd);
+	}
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("__remove_osd %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_requests));
+	rb_erase(&osd->o_node, &osdc->osds);
+	list_del_init(&osd->o_osd_lru);
+	ceph_con_close(&osd->o_con);
+	put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+			      struct ceph_osd *osd)
+{
+	dout("__move_osd_to_lru %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_osd_lru));
+	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+	dout("__remove_osd_from_lru %p\n", osd);
+	if (!list_empty(&osd->o_osd_lru))
+		list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+	struct ceph_osd *osd, *nosd;
+
+	dout("__remove_old_osds %p\n", osdc);
+	mutex_lock(&osdc->request_mutex);
+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+		if (!remove_all && time_before(jiffies, osd->lru_ttl))
+			break;
+		__remove_osd(osdc, osd);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	struct ceph_osd_request *req;
+	int ret = 0;
+
+	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+	if (list_empty(&osd->o_requests)) {
+		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
+	} else {
+		ceph_con_close(&osd->o_con);
+		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		osd->o_incarnation++;
+	}
+	return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+	struct rb_node **p = &osdc->osds.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd *osd = NULL;
+
+	while (*p) {
+		parent = *p;
+		osd = rb_entry(parent, struct ceph_osd, o_node);
+		if (new->o_osd < osd->o_osd)
+			p = &(*p)->rb_left;
+		else if (new->o_osd > osd->o_osd)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->o_node, parent, p);
+	rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+	struct ceph_osd *osd;
+	struct rb_node *n = osdc->osds.rb_node;
+
+	while (n) {
+		osd = rb_entry(n, struct ceph_osd, o_node);
+		if (o < osd->o_osd)
+			n = n->rb_left;
+		else if (o > osd->o_osd)
+			n = n->rb_right;
+		else
+			return osd;
+	}
+	return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+	schedule_delayed_work(&osdc->timeout_work,
+			osdc->client->options->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid.  If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
+{
+	mutex_lock(&osdc->request_mutex);
+	req->r_tid = ++osdc->last_tid;
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
+
+	dout("register_request %p tid %lld\n", req, req->r_tid);
+	__insert_request(osdc, req);
+	ceph_osdc_get_request(req);
+	osdc->num_requests++;
+
+	if (osdc->num_requests == 1) {
+		dout(" first request, scheduling timeout\n");
+		__schedule_osd_timeout(osdc);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+				 struct ceph_osd_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &osdc->requests);
+	osdc->num_requests--;
+
+	if (req->r_osd) {
+		/* make sure the original request isn't in flight. */
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests))
+			__move_osd_to_lru(osdc, req->r_osd);
+		req->r_osd = NULL;
+	}
+
+	ceph_osdc_put_request(req);
+
+	list_del_init(&req->r_req_lru_item);
+	if (osdc->num_requests == 0) {
+		dout(" no requests, canceling timeout\n");
+		__cancel_osd_timeout(osdc);
+	}
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+	if (req->r_sent && req->r_osd) {
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		req->r_sent = 0;
+	}
+	list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+		      struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_pg pgid;
+	int acting[CEPH_PG_MAX_SIZE];
+	int o = -1, num = 0;
+	int err;
+
+	dout("map_osds %p tid %lld\n", req, req->r_tid);
+	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+				      &req->r_file_layout, osdc->osdmap);
+	if (err)
+		return err;
+	pgid = reqhead->layout.ol_pgid;
+	req->r_pgid = pgid;
+
+	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+	if (err > 0) {
+		o = acting[0];
+		num = err;
+	}
+
+	if ((req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_sent >= req->r_osd->o_incarnation &&
+	     req->r_num_pg_osds == num &&
+	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+	    (req->r_osd == NULL && o == -1))
+		return 0;  /* no change */
+
+	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	     req->r_osd ? req->r_osd->o_osd : -1);
+
+	/* record full pg acting set */
+	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+	req->r_num_pg_osds = num;
+
+	if (req->r_osd) {
+		__cancel_request(req);
+		list_del_init(&req->r_osd_item);
+		req->r_osd = NULL;
+	}
+
+	req->r_osd = __lookup_osd(osdc, o);
+	if (!req->r_osd && o >= 0) {
+		err = -ENOMEM;
+		req->r_osd = create_osd(osdc);
+		if (!req->r_osd)
+			goto out;
+
+		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+		req->r_osd->o_osd = o;
+		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+		__insert_osd(osdc, req->r_osd);
+
+		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+	}
+
+	if (req->r_osd) {
+		__remove_osd_from_lru(req->r_osd);
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+	}
+	err = 1;   /* osd or pg changed */
+
+out:
+	return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead;
+	int err;
+
+	err = __map_osds(osdc, req);
+	if (err < 0)
+		return err;
+	if (req->r_osd == NULL) {
+		dout("send_request %p no up osds in pg\n", req);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+		return 0;
+	}
+
+	dout("send_request %p tid %llu to osd%d flags %d\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+	reqhead->reassert_version = req->r_reassert_version;
+
+	req->r_stamp = jiffies;
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+	ceph_msg_get(req->r_request); /* send consumes a ref */
+	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	req->r_sent = req->r_osd->o_incarnation;
+	return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd *osd;
+	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+	unsigned long keepalive =
+		osdc->client->options->osd_keepalive_timeout * HZ;
+	unsigned long last_stamp = 0;
+	struct rb_node *p;
+	struct list_head slow_osds;
+
+	dout("timeout\n");
+	down_read(&osdc->map_sem);
+
+	ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			int err;
+
+			dout("osdc resending prev failed %lld\n", req->r_tid);
+			err = __send_request(osdc, req);
+			if (err)
+				dout("osdc failed again on %lld\n", req->r_tid);
+			else
+				req->r_resend = false;
+			continue;
+		}
+	}
+
+	/*
+	 * reset osds that appear to be _really_ unresponsive.  this
+	 * is a failsafe measure.. we really shouldn't be getting to
+	 * this point if the system is working properly.  the monitors
+	 * should mark the osd as failed and we should find out about
+	 * it from an updated osd map.
+	 */
+	while (timeout && !list_empty(&osdc->req_lru)) {
+		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+				 r_req_lru_item);
+
+		if (time_before(jiffies, req->r_stamp + timeout))
+			break;
+
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
+		last_req = req;
+		last_stamp = req->r_stamp;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+			   req->r_tid, osd->o_osd);
+		__kick_requests(osdc, osd);
+	}
+
+	/*
+	 * ping osds that are a bit slow.  this ensures that if there
+	 * is a break in the TCP connection we will notice, and reopen
+	 * a connection with that osd (from the fault callback).
+	 */
+	INIT_LIST_HEAD(&slow_osds);
+	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+		if (time_before(jiffies, req->r_stamp + keepalive))
+			break;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		dout(" tid %llu is slow, will send keepalive on osd%d\n",
+		     req->r_tid, osd->o_osd);
+		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+	}
+	while (!list_empty(&slow_osds)) {
+		osd = list_entry(slow_osds.next, struct ceph_osd,
+				 o_keepalive_item);
+		list_del_init(&osd->o_keepalive_item);
+		ceph_con_keepalive(&osd->o_con);
+	}
+
+	__schedule_osd_timeout(osdc);
+	mutex_unlock(&osdc->request_mutex);
+
+	up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client,
+			     osds_timeout_work.work);
+	unsigned long delay =
+		osdc->client->options->osd_idle_ttl * HZ >> 2;
+
+	dout("osds timeout\n");
+	down_read(&osdc->map_sem);
+	remove_old_osds(osdc, 0);
+	up_read(&osdc->map_sem);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+			      round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+			 struct ceph_connection *con)
+{
+	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int numops, object_len, flags;
+	s32 result;
+
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*rhead))
+		goto bad;
+	numops = le32_to_cpu(rhead->num_ops);
+	object_len = le32_to_cpu(rhead->object_len);
+	result = le32_to_cpu(rhead->result);
+	if (msg->front.iov_len != sizeof(*rhead) + object_len +
+	    numops * sizeof(struct ceph_osd_op))
+		goto bad;
+	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+	/* lookup */
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (req == NULL) {
+		dout("handle_reply tid %llu dne\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		return;
+	}
+	ceph_osdc_get_request(req);
+	flags = le32_to_cpu(rhead->flags);
+
+	/*
+	 * if this connection filled our message, drop our reference now, to
+	 * avoid a (safe but slower) revoke later.
+	 */
+	if (req->r_con_filling_msg == con && req->r_reply == msg) {
+		dout(" dropping con_filling_msg ref %p\n", con);
+		req->r_con_filling_msg = NULL;
+		ceph_con_put(con);
+	}
+
+	if (!req->r_got_reply) {
+		unsigned bytes;
+
+		req->r_result = le32_to_cpu(rhead->result);
+		bytes = le32_to_cpu(msg->hdr.data_len);
+		dout("handle_reply result %d bytes %d\n", req->r_result,
+		     bytes);
+		if (req->r_result == 0)
+			req->r_result = bytes;
+
+		/* in case this is a write and we need to replay, */
+		req->r_reassert_version = rhead->reassert_version;
+
+		req->r_got_reply = 1;
+	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+		dout("handle_reply tid %llu dup ack\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		goto done;
+	}
+
+	dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+	/* either this is a read, or we got the safe response */
+	if (result < 0 ||
+	    (flags & CEPH_OSD_FLAG_ONDISK) ||
+	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+		__unregister_request(osdc, req);
+
+	mutex_unlock(&osdc->request_mutex);
+
+	if (req->r_callback)
+		req->r_callback(req, msg);
+	else
+		complete_all(&req->r_completion);
+
+	if (flags & CEPH_OSD_FLAG_ONDISK) {
+		if (req->r_safe_callback)
+			req->r_safe_callback(req, msg);
+		complete_all(&req->r_safe_completion);  /* fsync waiter */
+	}
+
+done:
+	ceph_osdc_put_request(req);
+	return;
+
+bad:
+	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+	       (int)sizeof(*rhead));
+	ceph_msg_dump(msg);
+}
+
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *p, *n;
+	int needmap = 0;
+	int err;
+
+	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+	if (kickosd) {
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
+	} else {
+		for (p = rb_first(&osdc->osds); p; p = n) {
+			struct ceph_osd *osd =
+				rb_entry(p, struct ceph_osd, o_node);
+
+			n = rb_next(p);
+			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+			    memcmp(&osd->o_con.peer_addr,
+				   ceph_osd_addr(osdc->osdmap,
+						 osd->o_osd),
+				   sizeof(struct ceph_entity_addr)) != 0)
+				__reset_osd(osdc, osd);
+		}
+	}
+
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			dout(" r_resend set on tid %llu\n", req->r_tid);
+			__cancel_request(req);
+			goto kick;
+		}
+		if (req->r_osd && kickosd == req->r_osd) {
+			__cancel_request(req);
+			goto kick;
+		}
+
+		err = __map_osds(osdc, req);
+		if (err == 0)
+			continue;  /* no change */
+		if (err < 0) {
+			/*
+			 * FIXME: really, we should set the request
+			 * error and fail if this isn't a 'nofail'
+			 * request, but that's a fair bit more
+			 * complicated to do.  So retry!
+			 */
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+			continue;
+		}
+		if (req->r_osd == NULL) {
+			dout("tid %llu maps to no valid osd\n", req->r_tid);
+			needmap++;  /* request a newer map */
+			continue;
+		}
+
+kick:
+		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+		     req->r_osd ? req->r_osd->o_osd : -1);
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+		err = __send_request(osdc, req);
+		if (err) {
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+		}
+	}
+
+	return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	int needmap;
+
+	mutex_lock(&osdc->request_mutex);
+	needmap = __kick_requests(osdc, kickosd);
+	mutex_unlock(&osdc->request_mutex);
+
+	if (needmap) {
+		dout("%d requests for down osds, need new map\n", needmap);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	}
+
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end, *next;
+	u32 nr_maps, maplen;
+	u32 epoch;
+	struct ceph_osdmap *newmap = NULL, *oldmap;
+	int err;
+	struct ceph_fsid fsid;
+
+	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* verify fsid */
+	ceph_decode_need(&p, end, sizeof(fsid), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(osdc->client, &fsid) < 0)
+		return;
+
+	down_write(&osdc->map_sem);
+
+	/* incremental maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d inc maps\n", nr_maps);
+	while (nr_maps > 0) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		next = p + maplen;
+		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+			dout("applying incremental map %u len %d\n",
+			     epoch, maplen);
+			newmap = osdmap_apply_incremental(&p, next,
+							  osdc->osdmap,
+							  osdc->client->msgr);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			if (newmap != osdc->osdmap) {
+				ceph_osdmap_destroy(osdc->osdmap);
+				osdc->osdmap = newmap;
+			}
+		} else {
+			dout("ignoring incremental map %u len %d\n",
+			     epoch, maplen);
+		}
+		p = next;
+		nr_maps--;
+	}
+	if (newmap)
+		goto done;
+
+	/* full maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d full maps\n", nr_maps);
+	while (nr_maps) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		if (nr_maps > 1) {
+			dout("skipping non-latest full map %u len %d\n",
+			     epoch, maplen);
+		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+			dout("skipping full map %u len %d, "
+			     "older than our %u\n", epoch, maplen,
+			     osdc->osdmap->epoch);
+		} else {
+			dout("taking full map %u len %d\n", epoch, maplen);
+			newmap = osdmap_decode(&p, p+maplen);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			oldmap = osdc->osdmap;
+			osdc->osdmap = newmap;
+			if (oldmap)
+				ceph_osdmap_destroy(oldmap);
+		}
+		p += maplen;
+		nr_maps--;
+	}
+
+done:
+	downgrade_write(&osdc->map_sem);
+	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+	if (newmap)
+		kick_requests(osdc, NULL);
+	up_read(&osdc->map_sem);
+	wake_up_all(&osdc->client->auth_wq);
+	return;
+
+bad:
+	pr_err("osdc handle_map corrupt msg\n");
+	ceph_msg_dump(msg);
+	up_write(&osdc->map_sem);
+	return;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
+{
+	int rc = 0;
+
+	req->r_request->pages = req->r_pages;
+	req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+	req->r_request->bio = req->r_bio;
+#endif
+	req->r_request->trail = req->r_trail;
+
+	register_request(osdc, req);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	/*
+	 * a racing kick_requests() may have sent the message for us
+	 * while we dropped request_mutex above, so only send now if
+	 * the request still han't been touched yet.
+	 */
+	if (req->r_sent == 0) {
+		rc = __send_request(osdc, req);
+		if (rc) {
+			if (nofail) {
+				dout("osdc_start_request failed send, "
+				     " marking %lld\n", req->r_tid);
+				req->r_resend = true;
+				rc = 0;
+			} else {
+				__unregister_request(osdc, req);
+			}
+		}
+	}
+	mutex_unlock(&osdc->request_mutex);
+	up_read(&osdc->map_sem);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	int rc;
+
+	rc = wait_for_completion_interruptible(&req->r_completion);
+	if (rc < 0) {
+		mutex_lock(&osdc->request_mutex);
+		__cancel_request(req);
+		__unregister_request(osdc, req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+		return rc;
+	}
+
+	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+	return req->r_result;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	u64 last_tid, next_tid = 0;
+
+	mutex_lock(&osdc->request_mutex);
+	last_tid = osdc->last_tid;
+	while (1) {
+		req = __lookup_request_ge(osdc, next_tid);
+		if (!req)
+			break;
+		if (req->r_tid > last_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+			continue;
+
+		ceph_osdc_get_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("sync waiting on tid %llu (last is %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&osdc->request_mutex);
+		ceph_osdc_put_request(req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+	dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+	int err;
+
+	dout("init\n");
+	osdc->client = client;
+	osdc->osdmap = NULL;
+	init_rwsem(&osdc->map_sem);
+	init_completion(&osdc->map_waiters);
+	osdc->last_requested_map = 0;
+	mutex_init(&osdc->request_mutex);
+	osdc->last_tid = 0;
+	osdc->osds = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->osd_lru);
+	osdc->requests = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->req_lru);
+	osdc->num_requests = 0;
+	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+
+	err = -ENOMEM;
+	osdc->req_mempool = mempool_create_kmalloc_pool(10,
+					sizeof(struct ceph_osd_request));
+	if (!osdc->req_mempool)
+		goto out;
+
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
+	if (err < 0)
+		goto out_mempool;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
+	if (err < 0)
+		goto out_msgpool;
+	return 0;
+
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+	mempool_destroy(osdc->req_mempool);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_osdc_init);
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work_sync(&osdc->timeout_work);
+	cancel_delayed_work_sync(&osdc->osds_timeout_work);
+	if (osdc->osdmap) {
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = NULL;
+	}
+	remove_old_osds(osdc, 1);
+	mempool_destroy(osdc->req_mempool);
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+EXPORT_SYMBOL(ceph_osdc_stop);
+
+/*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			u32 truncate_seq, u64 truncate_size,
+			struct page **pages, int num_pages)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+	     vino.snap, off, *plen);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0, truncate_seq, truncate_size, NULL,
+				    false, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short read due to an object boundary */
+	req->r_pages = pages;
+
+	dout("readpages  final extent is %llu~%llu (%d pages)\n",
+	     off, *plen, req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, false);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	dout("readpages result %d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+			 struct ceph_file_layout *layout,
+			 struct ceph_snap_context *snapc,
+			 u64 off, u64 len,
+			 u32 truncate_seq, u64 truncate_size,
+			 struct timespec *mtime,
+			 struct page **pages, int num_pages,
+			 int flags, int do_sync, bool nofail)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	BUG_ON(vino.snap != CEPH_NOSNAP);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+				    CEPH_OSD_OP_WRITE,
+				    flags | CEPH_OSD_FLAG_ONDISK |
+					    CEPH_OSD_FLAG_WRITE,
+				    snapc, do_sync,
+				    truncate_seq, truncate_size, mtime,
+				    nofail, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short write due to an object boundary */
+	req->r_pages = pages;
+	dout("writepages %llu~%llu (%d pages)\n", off, len,
+	     req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, nofail);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	if (rc == 0)
+		rc = len;
+	dout("writepages result %d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!osd)
+		goto out;
+	osdc = osd->o_osdc;
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(osdc, msg);
+		break;
+	case CEPH_MSG_OSD_OPREPLY:
+		handle_reply(osdc, msg, con);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+out:
+	ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_msg *m;
+	struct ceph_osd_request *req;
+	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
+	u64 tid;
+
+	tid = le64_to_cpu(hdr->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		*skip = 1;
+		m = NULL;
+		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+			osd->o_osd);
+		goto out;
+	}
+
+	if (req->r_con_filling_msg) {
+		dout("get_reply revoking msg %p from old con %p\n",
+		     req->r_reply, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
+	}
+
+	if (front > req->r_reply->front.iov_len) {
+		pr_warning("get_reply front %d > preallocated %d\n",
+			   front, (int)req->r_reply->front.iov_len);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+		if (!m)
+			goto out;
+		ceph_msg_put(req->r_reply);
+		req->r_reply = m;
+	}
+	m = ceph_msg_get(req->r_reply);
+
+	if (data_len > 0) {
+		unsigned data_off = le16_to_cpu(hdr->data_off);
+		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %lld reply %d > expected %d pages\n",
+				   tid, want, m->nr_pages);
+			*skip = 1;
+			ceph_msg_put(m);
+			m = NULL;
+			goto out;
+		}
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+		m->bio = req->r_bio;
+#endif
+	}
+	*skip = 0;
+	req->r_con_filling_msg = ceph_con_get(con);
+	dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+	mutex_unlock(&osdc->request_mutex);
+	return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		return ceph_msg_new(type, front, GFP_NOFS);
+	case CEPH_MSG_OSD_OPREPLY:
+		return get_reply(con, hdr, skip);
+	default:
+		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+			osd->o_osd);
+		*skip = 1;
+		return NULL;
+	}
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	if (get_osd(osd))
+		return con;
+	return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+			  void **buf, int *len, int *proto,
+			  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && o->o_authorizer) {
+		ac->ops->destroy_authorizer(ac, o->o_authorizer);
+		o->o_authorizer = NULL;
+	}
+	if (o->o_authorizer == NULL) {
+		ret = ac->ops->create_authorizer(
+			ac, CEPH_ENTITY_TYPE_OSD,
+			&o->o_authorizer,
+			&o->o_authorizer_buf,
+			&o->o_authorizer_buf_len,
+			&o->o_authorizer_reply_buf,
+			&o->o_authorizer_reply_buf_len);
+		if (ret)
+			return ret;
+	}
+
+	*proto = ac->protocol;
+	*buf = o->o_authorizer_buf;
+	*len = o->o_authorizer_buf_len;
+	*reply_buf = o->o_authorizer_reply_buf;
+	*reply_len = o->o_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+	return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+	.get = get_osd_con,
+	.put = put_osd_con,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.alloc_msg = alloc_msg,
+	.fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000000..d73f3f6efa36
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	int flag = 0;
+
+	if (!len)
+		goto done;
+
+	*str = '\0';
+	if (state) {
+		if (state & CEPH_OSD_EXISTS) {
+			snprintf(str, len, "exists");
+			flag = 1;
+		}
+		if (state & CEPH_OSD_UP) {
+			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+				 "up");
+			flag = 1;
+		}
+	} else {
+		snprintf(str, len, "doesn't exist");
+	}
+done:
+	return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+	int b = 0;
+	while (t) {
+		t = t >> 1;
+		b++;
+	}
+	return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+	pi->pgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+	pi->lpg_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+	pi->lpgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+				       struct crush_bucket_uniform *b)
+{
+	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+	b->item_weight = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+				    struct crush_bucket_list *b)
+{
+	int j;
+	dout("crush_decode_list_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->sum_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->sum_weights[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+				    struct crush_bucket_tree *b)
+{
+	int j;
+	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+	ceph_decode_32_safe(p, end, b->num_nodes, bad);
+	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+	if (b->node_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+	for (j = 0; j < b->num_nodes; j++)
+		b->node_weights[j] = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+				     struct crush_bucket_straw *b)
+{
+	int j;
+	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->straws == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->straws[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+	struct crush_map *c;
+	int err = -EINVAL;
+	int i, j;
+	void **p = &pbyval;
+	void *start = pbyval;
+	u32 magic;
+
+	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	c = kzalloc(sizeof(*c), GFP_NOFS);
+	if (c == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_need(p, end, 4*sizeof(u32), bad);
+	magic = ceph_decode_32(p);
+	if (magic != CRUSH_MAGIC) {
+		pr_err("crush_decode magic %x != current %x\n",
+		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
+		goto bad;
+	}
+	c->max_buckets = ceph_decode_32(p);
+	c->max_rules = ceph_decode_32(p);
+	c->max_devices = ceph_decode_32(p);
+
+	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+	if (c->device_parents == NULL)
+		goto badmem;
+	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+	if (c->bucket_parents == NULL)
+		goto badmem;
+
+	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+	if (c->buckets == NULL)
+		goto badmem;
+	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+	if (c->rules == NULL)
+		goto badmem;
+
+	/* buckets */
+	for (i = 0; i < c->max_buckets; i++) {
+		int size = 0;
+		u32 alg;
+		struct crush_bucket *b;
+
+		ceph_decode_32_safe(p, end, alg, bad);
+		if (alg == 0) {
+			c->buckets[i] = NULL;
+			continue;
+		}
+		dout("crush_decode bucket %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		switch (alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			size = sizeof(struct crush_bucket_uniform);
+			break;
+		case CRUSH_BUCKET_LIST:
+			size = sizeof(struct crush_bucket_list);
+			break;
+		case CRUSH_BUCKET_TREE:
+			size = sizeof(struct crush_bucket_tree);
+			break;
+		case CRUSH_BUCKET_STRAW:
+			size = sizeof(struct crush_bucket_straw);
+			break;
+		default:
+			err = -EINVAL;
+			goto bad;
+		}
+		BUG_ON(size == 0);
+		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+		if (b == NULL)
+			goto badmem;
+
+		ceph_decode_need(p, end, 4*sizeof(u32), bad);
+		b->id = ceph_decode_32(p);
+		b->type = ceph_decode_16(p);
+		b->alg = ceph_decode_8(p);
+		b->hash = ceph_decode_8(p);
+		b->weight = ceph_decode_32(p);
+		b->size = ceph_decode_32(p);
+
+		dout("crush_decode bucket size %d off %x %p to %p\n",
+		     b->size, (int)(*p-start), *p, end);
+
+		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+		if (b->items == NULL)
+			goto badmem;
+		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+		if (b->perm == NULL)
+			goto badmem;
+		b->perm_n = 0;
+
+		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+		for (j = 0; j < b->size; j++)
+			b->items[j] = ceph_decode_32(p);
+
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			err = crush_decode_uniform_bucket(p, end,
+				  (struct crush_bucket_uniform *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_LIST:
+			err = crush_decode_list_bucket(p, end,
+			       (struct crush_bucket_list *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_TREE:
+			err = crush_decode_tree_bucket(p, end,
+				(struct crush_bucket_tree *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_STRAW:
+			err = crush_decode_straw_bucket(p, end,
+				(struct crush_bucket_straw *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		}
+	}
+
+	/* rules */
+	dout("rule vec is %p\n", c->rules);
+	for (i = 0; i < c->max_rules; i++) {
+		u32 yes;
+		struct crush_rule *r;
+
+		ceph_decode_32_safe(p, end, yes, bad);
+		if (!yes) {
+			dout("crush_decode NO rule %d off %x %p to %p\n",
+			     i, (int)(*p-start), *p, end);
+			c->rules[i] = NULL;
+			continue;
+		}
+
+		dout("crush_decode rule %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		/* len */
+		ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+		err = -EINVAL;
+		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+			goto bad;
+#endif
+		r = c->rules[i] = kmalloc(sizeof(*r) +
+					  yes*sizeof(struct crush_rule_step),
+					  GFP_NOFS);
+		if (r == NULL)
+			goto badmem;
+		dout(" rule %d is at %p\n", i, r);
+		r->len = yes;
+		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+		for (j = 0; j < r->len; j++) {
+			r->steps[j].op = ceph_decode_32(p);
+			r->steps[j].arg1 = ceph_decode_32(p);
+			r->steps[j].arg2 = ceph_decode_32(p);
+		}
+	}
+
+	/* ignore trailing name maps. */
+
+	dout("crush_decode success\n");
+	return c;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	dout("crush_decode fail %d\n", err);
+	crush_destroy(c);
+	return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+	u64 a = *(u64 *)&l;
+	u64 b = *(u64 *)&r;
+
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+			       struct rb_root *root)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_mapping *pg = NULL;
+	int c;
+
+	while (*p) {
+		parent = *p;
+		pg = rb_entry(parent, struct ceph_pg_mapping, node);
+		c = pgid_cmp(new->pgid, pg->pgid);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+						   struct ceph_pg pgid)
+{
+	struct rb_node *n = root->rb_node;
+	struct ceph_pg_mapping *pg;
+	int c;
+
+	while (n) {
+		pg = rb_entry(n, struct ceph_pg_mapping, node);
+		c = pgid_cmp(pgid, pg->pgid);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return pg;
+	}
+	return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_pool_info *pi = NULL;
+
+	while (*p) {
+		parent = *p;
+		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+		if (new->id < pi->id)
+			p = &(*p)->rb_left;
+		else if (new->id > pi->id)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+	struct ceph_pg_pool_info *pi;
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		pi = rb_entry(n, struct ceph_pg_pool_info, node);
+		if (id < pi->id)
+			n = n->rb_left;
+		else if (id > pi->id)
+			n = n->rb_right;
+		else
+			return pi;
+	}
+	return NULL;
+}
+
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+	struct rb_node *rbp;
+
+	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rbp, struct ceph_pg_pool_info, node);
+		if (pi->name && strcmp(pi->name, name) == 0)
+			return pi->id;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+	kfree(pi->name);
+	kfree(pi);
+}
+
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+	unsigned n, m;
+
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+
+	/* num_snaps * snap_info_t */
+	n = le32_to_cpu(pi->v.num_snaps);
+	while (n--) {
+		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+				 sizeof(struct ceph_timespec), bad);
+		*p += sizeof(u64) +       /* key */
+			1 + sizeof(u64) + /* u8, snapid */
+			sizeof(struct ceph_timespec);
+		m = ceph_decode_32(p);    /* snap name */
+		*p += m;
+	}
+
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		__remove_pg_pool(&map->pg_pools, pi);
+	}
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+	struct ceph_osdmap *map;
+	u16 version;
+	u32 len, max, i;
+	u8 ev;
+	int err = -EINVAL;
+	void *start = *p;
+	struct ceph_pg_pool_info *pi;
+
+	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (map == NULL)
+		return ERR_PTR(-ENOMEM);
+	map->pg_temp = RB_ROOT;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_VERSION) {
+		pr_warning("got unknown v %d > %d of osdmap\n", version,
+			   CEPH_OSDMAP_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+	map->epoch = ceph_decode_32(p);
+	ceph_decode_copy(p, &map->created, sizeof(map->created));
+	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+	ceph_decode_32_safe(p, end, max, bad);
+	while (max--) {
+		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
+		if (!pi)
+			goto bad;
+		pi->id = ceph_decode_32(p);
+		ev = ceph_decode_8(p); /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			kfree(pi);
+			goto bad;
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0)
+			goto bad;
+		__insert_pg_pool(&map->pg_pools, pi);
+	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+	ceph_decode_32_safe(p, end, map->flags, bad);
+
+	max = ceph_decode_32(p);
+
+	/* (re)alloc osd arrays */
+	err = osdmap_set_max_osd(map, max);
+	if (err < 0)
+		goto bad;
+	dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+	/* osds */
+	err = -EINVAL;
+	ceph_decode_need(p, end, 3*sizeof(u32) +
+			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+				       sizeof(*map->osd_addr)), bad);
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+	*p += 4; /* skip length field (should match max) */
+	for (i = 0; i < map->max_osd; i++)
+		map->osd_weight[i] = ceph_decode_32(p);
+
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_addr(&map->osd_addr[i]);
+
+	/* pg_temp */
+	ceph_decode_32_safe(p, end, len, bad);
+	for (i = 0; i < len; i++) {
+		int n, j;
+		struct ceph_pg pgid;
+		struct ceph_pg_mapping *pg;
+
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		n = ceph_decode_32(p);
+		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		err = -ENOMEM;
+		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+		if (!pg)
+			goto bad;
+		pg->pgid = pgid;
+		pg->len = n;
+		for (j = 0; j < n; j++)
+			pg->osds[j] = ceph_decode_32(p);
+
+		err = __insert_pg_mapping(pg, &map->pg_temp);
+		if (err)
+			goto bad;
+		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+	}
+
+	/* crush */
+	ceph_decode_32_safe(p, end, len, bad);
+	dout("osdmap_decode crush len %d from off 0x%x\n", len,
+	     (int)(*p - start));
+	ceph_decode_need(p, end, len, bad);
+	map->crush = crush_decode(*p, end);
+	*p += len;
+	if (IS_ERR(map->crush)) {
+		err = PTR_ERR(map->crush);
+		map->crush = NULL;
+		goto bad;
+	}
+
+	/* ignore the rest of the map */
+	*p = end;
+
+	dout("osdmap_decode done %p %p\n", *p, end);
+	return map;
+
+bad:
+	dout("osdmap_decode fail\n");
+	ceph_osdmap_destroy(map);
+	return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map,
+					     struct ceph_messenger *msgr)
+{
+	struct crush_map *newcrush = NULL;
+	struct ceph_fsid fsid;
+	u32 epoch = 0;
+	struct ceph_timespec modified;
+	u32 len, pool;
+	__s32 new_pool_max, new_flags, max;
+	void *start = *p;
+	int err = -EINVAL;
+	u16 version;
+	struct rb_node *rbp;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_INC_VERSION) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+			   CEPH_OSDMAP_INC_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+			 bad);
+	ceph_decode_copy(p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(p);
+	BUG_ON(epoch != map->epoch+1);
+	ceph_decode_copy(p, &modified, sizeof(modified));
+	new_pool_max = ceph_decode_32(p);
+	new_flags = ceph_decode_32(p);
+
+	/* full map? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental full map len %d, %p to %p\n",
+		     len, *p, end);
+		return osdmap_decode(p, min(*p+len, end));
+	}
+
+	/* new crush? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental new crush map len %d, %p to %p\n",
+		     len, *p, end);
+		newcrush = crush_decode(*p, min(*p+len, end));
+		if (IS_ERR(newcrush))
+			return ERR_CAST(newcrush);
+		*p += len;
+	}
+
+	/* new flags? */
+	if (new_flags >= 0)
+		map->flags = new_flags;
+	if (new_pool_max >= 0)
+		map->pool_max = new_pool_max;
+
+	ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+	/* new max? */
+	max = ceph_decode_32(p);
+	if (max >= 0) {
+		err = osdmap_set_max_osd(map, max);
+		if (err < 0)
+			goto bad;
+	}
+
+	map->epoch++;
+	map->modified = map->modified;
+	if (newcrush) {
+		if (map->crush)
+			crush_destroy(map->crush);
+		map->crush = newcrush;
+		newcrush = NULL;
+	}
+
+	/* new_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		__u8 ev;
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+		ev = ceph_decode_8(p);  /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (!pi) {
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
+			if (!pi) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pi->id = pool;
+			__insert_pg_pool(&map->pg_pools, pi);
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0)
+			goto bad;
+	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	/* old_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
+	}
+
+	/* new_up */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		struct ceph_entity_addr addr;
+		ceph_decode_32_safe(p, end, osd, bad);
+		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		ceph_decode_addr(&addr);
+		pr_info("osd%d up\n", osd);
+		BUG_ON(osd >= map->max_osd);
+		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	/* new_down */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		ceph_decode_32_safe(p, end, osd, bad);
+		(*p)++;  /* clean flag */
+		pr_info("osd%d down\n", osd);
+		if (osd < map->max_osd)
+			map->osd_state[osd] &= ~CEPH_OSD_UP;
+	}
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd, off;
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		osd = ceph_decode_32(p);
+		off = ceph_decode_32(p);
+		pr_info("osd%d weight 0x%x %s\n", osd, off,
+		     off == CEPH_OSD_IN ? "(in)" :
+		     (off == CEPH_OSD_OUT ? "(out)" : ""));
+		if (osd < map->max_osd)
+			map->osd_weight[osd] = off;
+	}
+
+	/* new_pg_temp */
+	rbp = rb_first(&map->pg_temp);
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_mapping *pg;
+		int j;
+		struct ceph_pg pgid;
+		u32 pglen;
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		pglen = ceph_decode_32(p);
+
+		/* remove any? */
+		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+						node)->pgid, pgid) <= 0) {
+			struct ceph_pg_mapping *cur =
+				rb_entry(rbp, struct ceph_pg_mapping, node);
+
+			rbp = rb_next(rbp);
+			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+			rb_erase(&cur->node, &map->pg_temp);
+			kfree(cur);
+		}
+
+		if (pglen) {
+			/* insert */
+			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+			if (!pg) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pg->pgid = pgid;
+			pg->len = pglen;
+			for (j = 0; j < pglen; j++)
+				pg->osds[j] = ceph_decode_32(p);
+			err = __insert_pg_mapping(pg, &map->pg_temp);
+			if (err) {
+				kfree(pg);
+				goto bad;
+			}
+			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+			     pglen);
+		}
+	}
+	while (rbp) {
+		struct ceph_pg_mapping *cur =
+			rb_entry(rbp, struct ceph_pg_mapping, node);
+
+		rbp = rb_next(rbp);
+		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+		rb_erase(&cur->node, &map->pg_temp);
+		kfree(cur);
+	}
+
+	/* ignore the rest */
+	*p = end;
+	return map;
+
+bad:
+	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+	       epoch, (int)(*p - start), *p, start, end);
+	print_hex_dump(KERN_DEBUG, "osdmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	if (newcrush)
+		crush_destroy(newcrush);
+	return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+				   u64 off, u64 *plen,
+				   u64 *ono,
+				   u64 *oxoff, u64 *oxlen)
+{
+	u32 osize = le32_to_cpu(layout->fl_object_size);
+	u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 bl, stripeno, stripepos, objsetno;
+	u32 su_per_object;
+	u64 t, su_offset;
+
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	     osize, su);
+	su_per_object = osize / su;
+	dout("osize %u / su %u = su_per_object %u\n", osize, su,
+	     su_per_object);
+
+	BUG_ON((su & ~PAGE_MASK) != 0);
+	/* bl = *off / su; */
+	t = off;
+	do_div(t, su);
+	bl = t;
+	dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+	stripeno = bl / sc;
+	stripepos = bl % sc;
+	objsetno = stripeno / su_per_object;
+
+	*ono = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
+	t = off;
+	su_offset = do_div(t, su);
+	*oxoff = su_offset + (stripeno % su_per_object) * su;
+
+	/*
+	 * Calculate the length of the extent being written to the selected
+	 * object. This is the minimum of the full length requested (plen) or
+	 * the remainder of the current stripe being written to.
+	 */
+	*oxlen = min_t(u64, *plen, su - su_offset);
+	*plen = *oxlen;
+
+	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+			    const char *oid,
+			    struct ceph_file_layout *fl,
+			    struct ceph_osdmap *osdmap)
+{
+	unsigned num, num_mask;
+	struct ceph_pg pgid;
+	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg_pool_info *pool;
+	unsigned ps;
+
+	BUG_ON(!osdmap);
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return -EIO;
+	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	if (preferred >= 0) {
+		ps += preferred;
+		num = le32_to_cpu(pool->v.lpg_num);
+		num_mask = pool->lpg_num_mask;
+	} else {
+		num = le32_to_cpu(pool->v.pg_num);
+		num_mask = pool->pg_num_mask;
+	}
+
+	pgid.ps = cpu_to_le16(ps);
+	pgid.preferred = cpu_to_le16(preferred);
+	pgid.pool = fl->fl_pg_pool;
+	if (preferred >= 0)
+		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+		     (int)preferred);
+	else
+		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+	ol->ol_pgid = pgid;
+	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_calc_object_layout);
+
+/*
+ * Calculate raw osd vector for the given pgid.  Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *osds, int *num)
+{
+	struct ceph_pg_mapping *pg;
+	struct ceph_pg_pool_info *pool;
+	int ruleno;
+	unsigned poolid, ps, pps;
+	int preferred;
+
+	/* pg_temp? */
+	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	if (pg) {
+		*num = pg->len;
+		return pg->osds;
+	}
+
+	/* crush */
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return NULL;
+	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+				 pool->v.type, pool->v.size);
+	if (ruleno < 0) {
+		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
+		       poolid, pool->v.crush_ruleset, pool->v.type,
+		       pool->v.size);
+		return NULL;
+	}
+
+	if (preferred >= 0)
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.lpgp_num),
+				      pool->lpgp_num_mask);
+	else
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.pgp_num),
+				      pool->pgp_num_mask);
+	pps += poolid;
+	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			     min_t(int, pool->v.size, *num),
+			     preferred, osdmap->osd_weight);
+	return osds;
+}
+
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *acting)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, o, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	o = 0;
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			acting[o++] = osds[i];
+	return o;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			return osds[i];
+	return -1;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..3b146cfac182
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,65 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+	struct page *page = list_entry(pl->head.prev, struct page,
+				       lru);
+	kunmap(page);
+}
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	if (pl->mapped_tail)
+		ceph_pagelist_unmap_tail(pl);
+
+	while (!list_empty(&pl->head)) {
+		struct page *page = list_first_entry(&pl->head, struct page,
+						     lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page = __page_cache_alloc(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	pl->room += PAGE_SIZE;
+	list_add_tail(&page->lru, &pl->head);
+	if (pl->mapped_tail)
+		ceph_pagelist_unmap_tail(pl);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+	while (pl->room < len) {
+		size_t bit = pl->room;
+		int ret;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, bit);
+		pl->length += bit;
+		pl->room -= bit;
+		buf += bit;
+		len -= bit;
+		ret = ceph_pagelist_addpage(pl);
+		if (ret)
+			return ret;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+						 int num_pages,
+						 loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = __page_cache_alloc(flags);
+		if (pages[i] == NULL) {
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(page_address(pages[i]) + po, data, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(data, page_address(pages[i]) + po, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+					 char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+		i++;
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;
+
+	off &= ~PAGE_CACHE_MASK;
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+
+	/* leading partial page? */
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
-- 
cgit v1.2.3


From 4c32f5dda5ffe23687a55da1538b7cc426710d1a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 24 Aug 2010 16:27:36 -0700
Subject: ceph: do not hide .snap in root directory

Snaps in the root directory are now supported by the MDS, and harmless on
older versions.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a8d2aacc612b..495825268cb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -493,7 +493,6 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
 		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
-- 
cgit v1.2.3


From 93afd449aa3c0430ef409c13e1cb2b3f0458fc10 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 17 Sep 2010 08:38:25 -0700
Subject: ceph: only invalidate on check_caps if we actually have pages

The i_rdcache_gen value only implies we MAY have cached pages; actually
check the mapping to see if it's worth bothering with an invalidate.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3cff67cbb9c0..3c03460f48bd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1534,7 +1534,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    inode->i_data.nrpages &&                 /* have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
 	     (revoking & (CEPH_CAP_FILE_CACHE|
 			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
-- 
cgit v1.2.3


From 18a38193efcaac1fb3c94ad8fa04bb117850a3c2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 17 Sep 2010 10:46:44 -0700
Subject: ceph: use mapping->nrpages to determine if mapping is empty

This is simpler and faster.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3c03460f48bd..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1417,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-	struct page *page = find_get_page(mapping, 0);
-
-	if (!page)
-		return 1;
-
-	put_page(page);
-	return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1437,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	invalidate_mapping_pages(&inode->i_data, 0, -1);
 	spin_lock(&inode->i_lock);
 
-	if (mapping_is_empty(&inode->i_data) &&
+	if (inode->i_data.nrpages == 0 &&
 	    invalidating_gen == ci->i_rdcache_gen) {
 		/* success. */
 		dout("try_nonblocking_invalidate %p success\n", inode);
-- 
cgit v1.2.3


From fca4451acfdcf894154e4809529ca28a09db88ff Mon Sep 17 00:00:00 2001
From: Greg Farnum <gregf@hq.newdream.net>
Date: Fri, 17 Sep 2010 10:24:02 -0700
Subject: ceph: preallocate flock state without locks held

When the lock_kernel() turns into lock_flocks() and a spinlock, we won't
be able to do allocations with the lock held.  Preallocate space without
the lock, and retry if the lock state changes out from underneath us.

Signed-off-by: Greg Farnum <gregf@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/locks.c      | 17 +++++++++++++++--
 fs/ceph/mds_client.c | 42 +++++++++++++++++++++++++++++-------------
 2 files changed, 44 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 087afe9ca367..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
  * Encode the flock and fcntl locks for the given inode into the pagelist.
  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
  * sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return the value 1.
  */
 int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		      int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 	struct file_lock *lock;
 	struct ceph_filelock cephlock;
 	int err = 0;
+	int seen_fcntl = 0;
+	int seen_flock = 0;
 
 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
 	     num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_POSIX) {
+			++seen_fcntl;
+			if (seen_fcntl > num_fcntl_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
 			err = lock_to_ceph_filelock(lock, &cephlock);
 			if (err)
 				goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_FLOCK) {
+			++seen_flock;
+			if (seen_flock > num_flock_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
 			err = lock_to_ceph_filelock(lock, &cephlock);
 			if (err)
 				goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 33568239a08e..fbfc298ac55b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2365,19 +2365,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 	if (recon_state->flock) {
 		int num_fcntl_locks, num_flock_locks;
-
-		lock_kernel();
-		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-		rec.v2.flock_len = (2*sizeof(u32) +
-				    (num_fcntl_locks+num_flock_locks) *
-				    sizeof(struct ceph_filelock));
-
-		err = ceph_pagelist_append(pagelist, &rec, reclen);
-		if (!err)
-			err = ceph_encode_locks(inode, pagelist,
-						num_fcntl_locks,
-						num_flock_locks);
-		unlock_kernel();
+		struct ceph_pagelist_cursor trunc_point;
+
+		ceph_pagelist_set_cursor(pagelist, &trunc_point);
+		do {
+			lock_kernel();
+			ceph_count_locks(inode, &num_fcntl_locks,
+					 &num_flock_locks);
+			rec.v2.flock_len = (2*sizeof(u32) +
+					    (num_fcntl_locks+num_flock_locks) *
+					    sizeof(struct ceph_filelock));
+			unlock_kernel();
+
+			/* pre-alloc pagelist */
+			ceph_pagelist_truncate(pagelist, &trunc_point);
+			err = ceph_pagelist_append(pagelist, &rec, reclen);
+			if (!err)
+				err = ceph_pagelist_reserve(pagelist,
+							    rec.v2.flock_len);
+
+			/* encode locks */
+			if (!err) {
+				lock_kernel();
+				err = ceph_encode_locks(inode,
+							pagelist,
+							num_fcntl_locks,
+							num_flock_locks);
+				unlock_kernel();
+			}
+		} while (err == -ENOSPC);
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
-- 
cgit v1.2.3


From 496e59553c51ce18acc836de070106b583926b87 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 22 Sep 2010 19:57:10 -0700
Subject: ceph: switch from BKL to lock_flocks()

Switch from using the BKL explicitly to the new lock_flocks() interface.
Eventually this will turn into a spinlock.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fbfc298ac55b..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,11 +1,12 @@
 #include <linux/ceph/ceph_debug.h>
 
+#include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -2369,13 +2370,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 		ceph_pagelist_set_cursor(pagelist, &trunc_point);
 		do {
-			lock_kernel();
+			lock_flocks();
 			ceph_count_locks(inode, &num_fcntl_locks,
 					 &num_flock_locks);
 			rec.v2.flock_len = (2*sizeof(u32) +
 					    (num_fcntl_locks+num_flock_locks) *
 					    sizeof(struct ceph_filelock));
-			unlock_kernel();
+			unlock_flocks();
 
 			/* pre-alloc pagelist */
 			ceph_pagelist_truncate(pagelist, &trunc_point);
@@ -2386,12 +2387,12 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 			/* encode locks */
 			if (!err) {
-				lock_kernel();
+				lock_flocks();
 				err = ceph_encode_locks(inode,
 							pagelist,
 							num_fcntl_locks,
 							num_flock_locks);
-				unlock_kernel();
+				unlock_flocks();
 			}
 		} while (err == -ENOSPC);
 	} else {
-- 
cgit v1.2.3


From 6f453ed6c07dbed83b368269c9c0fb170866ee71 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 28 Sep 2010 09:53:10 -0700
Subject: ceph: fix debugfs warnings

Include "super.h" outside of CONFIG_DEBUG_FS to eliminate a compiler warning:

fs/ceph/debugfs.c:266: warning: 'struct ceph_fs_client' declared inside parameter list
fs/ceph/debugfs.c:266: warning: its scope is only this definition or declaration, which is probably not what you want
fs/ceph/debugfs.c:271: warning: 'struct ceph_fs_client' declared inside parameter list

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/debugfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 94fe1d284c3a..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -12,9 +12,10 @@
 #include <linux/ceph/auth.h>
 #include <linux/ceph/debugfs.h>
 
+#include "super.h"
+
 #ifdef CONFIG_DEBUG_FS
 
-#include "super.h"
 #include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
-- 
cgit v1.2.3


From 571dba52a34015a5a7aa5d480a86936878444a6f Mon Sep 17 00:00:00 2001
From: Greg Farnum <gregf@hq.newdream.net>
Date: Fri, 24 Sep 2010 14:56:40 -0700
Subject: ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ioctl.c              | 66 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/ioctl.h              |  4 ++-
 include/linux/ceph/ceph_fs.h |  1 +
 3 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 899578b0c46b..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -91,6 +91,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	return err;
 }
 
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err, i;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	if ((l.object_size & ~PAGE_MASK) ||
+	    (l.stripe_unit & ~PAGE_MASK) ||
+	    !l.stripe_unit ||
+	    (l.object_size &&
+	        (unsigned)l.object_size % (unsigned)l.stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	if (l.data_pool > 0) {
+		mutex_lock(&mdsc->mutex);
+		err = -EINVAL;
+		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+				err = 0;
+				break;
+			}
+		mutex_unlock(&mdsc->mutex);
+		if (err)
+			return err;
+	}
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+				       USE_AUTH_MDS);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+			cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+			cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+			cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool =
+			cpu_to_le32(l.data_pool);
+	req->r_args.setlayout.layout.fl_pg_preferred =
+			cpu_to_le32(l.preferred_osd);
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
 /*
  * Return object name, size/offset information, and location (OSD
  * number, network address) for a given file offset.
@@ -177,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case CEPH_IOC_SET_LAYOUT:
 		return ceph_ioctl_set_layout(file, (void __user *)arg);
 
+	case CEPH_IOC_SET_LAYOUT_POLICY:
+		return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
 	case CEPH_IOC_GET_DATALOC:
 		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
 
 	case CEPH_IOC_LAZYIO:
 		return ceph_ioctl_lazyio(file);
 	}
+
 	return -ENOTTY;
 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
 
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
 				   struct ceph_ioctl_layout)
 #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
 				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,	\
+				   struct ceph_ioctl_layout)
 
 /*
  * Extract identity, address of the OSD and object storing a given
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index d5619ac86711..c3c74aef289d 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
 	CEPH_MDS_OP_SETATTR    = 0x01108,
 	CEPH_MDS_OP_SETFILELOCK= 0x01109,
 	CEPH_MDS_OP_GETFILELOCK= 0x00110,
+	CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
 
 	CEPH_MDS_OP_MKNOD      = 0x01201,
 	CEPH_MDS_OP_LINK       = 0x01202,
-- 
cgit v1.2.3


From 61413c2f594e6b63db2b14c70c2e7d8cf02f9c00 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sun, 17 Oct 2010 21:55:21 +0200
Subject: fs/ceph/xattr.c: Use kmemdup

Convert a sequence of kmalloc and memcpy to use kmemdup.

The semantic patch that performs this transformation is:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression a,flag,len;
expression arg,e1,e2;
statement S;
@@

  a =
-  \(kmalloc\|kzalloc\)(len,flag)
+  kmemdup(arg,len,flag)
  <... when != a
  if (a == NULL || ...) S
  ...>
- memcpy(a,arg,len+1);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 70e919960acb..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -716,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
-	newname = kmalloc(name_len + 1, GFP_NOFS);
+	newname = kmemdup(name, name_len + 1, GFP_NOFS);
 	if (!newname)
 		goto out;
-	memcpy(newname, name, name_len + 1);
 
 	if (val_len) {
 		newval = kmalloc(val_len + 1, GFP_NOFS);
-- 
cgit v1.2.3


From efa4c1206eaff047c474af2136748a58eb8cc33b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 18 Oct 2010 14:04:31 -0700
Subject: ceph: do not carry i_lock for readdir from dcache

We were taking dcache_lock inside of i_lock, which introduces a dependency
not found elsewhere in the kernel, complicationg the vfs locking
scalability work.  Since we don't actually need it here anyway, remove
it.

We only need i_lock to test for the I_COMPLETE flag, so be careful to do
so without dcache_lock held.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 41 ++++++++++++++++-------------------------
 fs/ceph/super.h |  3 ++-
 2 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 495825268cb9..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -95,10 +95,7 @@ static unsigned fpos_off(loff_t p)
  */
 static int __dcache_readdir(struct file *filp,
 			    void *dirent, filldir_t filldir)
-		__releases(inode->i_lock)
-		__acquires(inode->i_lock)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_file_info *fi = filp->private_data;
 	struct dentry *parent = filp->f_dentry;
 	struct inode *dir = parent->d_inode;
@@ -154,7 +151,6 @@ more:
 
 	atomic_inc(&dentry->d_count);
 	spin_unlock(&dcache_lock);
-	spin_unlock(&inode->i_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -172,35 +168,30 @@ more:
 		} else {
 			dput(last);
 		}
-		last = NULL;
 	}
-
-	spin_lock(&inode->i_lock);
-	spin_lock(&dcache_lock);
-
 	last = dentry;
 
 	if (err < 0)
-		goto out_unlock;
+		goto out;
 
-	p = p->prev;
 	filp->f_pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-		goto more;
-	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-	err = -EAGAIN;
+	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	spin_lock(&dcache_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
 
 out_unlock:
 	spin_unlock(&dcache_lock);
-
-	if (last) {
-		spin_unlock(&inode->i_lock);
+out:
+	if (last)
 		dput(last);
-		spin_lock(&inode->i_lock);
-	}
-
 	return err;
 }
 
@@ -272,13 +263,13 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		spin_unlock(&inode->i_lock);
 		err = __dcache_readdir(filp, dirent, filldir);
-		if (err != -EAGAIN) {
-			spin_unlock(&inode->i_lock);
+		if (err != -EAGAIN)
 			return err;
-		}
+	} else {
+		spin_unlock(&inode->i_lock);
 	}
-	spin_unlock(&inode->i_lock);
 	if (fi->dentry) {
 		err = note_last_dentry(fi, fi->dentry->d_name.name,
 				       fi->dentry->d_name.len);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e2e904442ce2..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -400,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool r;
 
-	smp_mb();
+	spin_lock(&inode->i_lock);
 	r = (ci->i_ceph_flags & mask) == mask;
+	spin_unlock(&inode->i_lock);
 	return r;
 }
 
-- 
cgit v1.2.3


From 39037559e651c417fb68b828926dc61cd5d6e5e2 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 18 Oct 2010 08:28:50 +0300
Subject: UBIFS: remove a bit of unneeded code

This is a clean-up patch which:

1. Removes explicite 'hrtimer_cancel()' after 'ubifs_wbuf_sync()' in
   'ubifs_remount_ro()', because the timers will be canceled by
   'ubifs_wbuf_sync()', no need to cancel them for the second time.
2. Remove "if (c->jheads)" check from 'ubifs_put_super()', because
   at journal heads must always be allocated there, since we checked
   earlier that we were mounted R/W, and the olny situation when
   journal heads are not allocated is when mounter or re-mounted R/O.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index bb6ed5da2d10..6ff2f802bd7e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1686,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 
 	dbg_save_space_info(c);
 
-	for (i = 0; i < c->jhead_cnt; i++) {
+	for (i = 0; i < c->jhead_cnt; i++)
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		hrtimer_cancel(&c->jheads[i].wbuf.timer);
-	}
 
 	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
 	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1754,9 +1752,8 @@ static void ubifs_put_super(struct super_block *sb)
 			int err;
 
 			/* Synchronize write-buffers */
-			if (c->jheads)
-				for (i = 0; i < c->jhead_cnt; i++)
-					ubifs_wbuf_sync(&c->jheads[i].wbuf);
+			for (i = 0; i < c->jhead_cnt; i++)
+				ubifs_wbuf_sync(&c->jheads[i].wbuf);
 
 			/*
 			 * We are being cleanly unmounted which means the
-- 
cgit v1.2.3


From 3601ba27353a968df843454e4b81155376682505 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 18 Oct 2010 08:32:35 +0300
Subject: UBIFS: do not forget to cancel timers

This is a bug-fix: when we unmount, and we are currently in R/O
mode because of an error - we do not sync write-buffers, which
means we also do not cancel write-buffer timers we may possibly
have armed. This patch fixes the issue.

The issue can easily be reproduced by enabling UBIFS failure debug
mode (echo 4 > /sys/module/ubifs/parameters/debug_tsts) and
unmounting as soon as a failure happen. At some point the system
oopses because we have an armed hrtimer but UBIFS is unmounted
already.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6ff2f802bd7e..9a47c9f0ad07 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1772,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
 				 */
 				ubifs_err("failed to write master node, "
 					  "error %d", err);
+		} else {
+			for (i = 0; i < c->jhead_cnt; i++)
+				/* Make sure write-buffer timers are canceled */
+				hrtimer_cancel(&c->jheads[i].wbuf.timer);
 		}
 	}
 
-- 
cgit v1.2.3


From 6599fcbd01baf9d57e847db103d215ea4ec088f9 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 18 Oct 2010 10:00:40 +0300
Subject: UBIFS: do not allocate unneeded scan buffer

In 'ubifs_replay_journal()' we allocate 'sbuf' for scanning the log.
However, we already have 'c->sbuf' for these purposes, so do not
allocate yet another one. This reduces UBIFS memory consumption while
recovering.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/replay.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 7df04ba4878e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1013,7 +1013,6 @@ out:
 int ubifs_replay_journal(struct ubifs_info *c)
 {
 	int err, i, lnum, offs, free;
-	void *sbuf = NULL;
 
 	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
 
@@ -1028,10 +1027,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
-	sbuf = vmalloc(c->leb_size);
-	if (!sbuf)
-		return -ENOMEM;
-
 	dbg_mnt("start replaying the journal");
 	c->replaying = 1;
 	lnum = c->ltail_lnum = c->lhead_lnum;
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 			lnum = UBIFS_LOG_LNUM;
 			offs = 0;
 		}
-		err = replay_log_leb(c, lnum, offs, sbuf);
+		err = replay_log_leb(c, lnum, offs, c->sbuf);
 		if (err == 1)
 			/* We hit the end of the log */
 			break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
 out:
 	destroy_replay_tree(c);
 	destroy_bud_list(c);
-	vfree(sbuf);
 	c->replaying = 0;
 	return err;
 }
-- 
cgit v1.2.3


From e5953cbdff26f7cbae7eff30cd9b18c4e19b7594 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Thu, 21 Oct 2010 14:56:00 +0200
Subject: pipe: fix failure to return error code on ->confirm()

The arguments were transposed, we want to assign the error code to
'ret', which is being returned.

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..37eb1ebeaa90 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 			error = ops->confirm(pipe, buf);
 			if (error) {
 				if (!ret)
-					error = ret;
+					ret = error;
 				break;
 			}
 
-- 
cgit v1.2.3


From 3f9bcca7820a6711307b6499952b13cfcfc31dd6 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Mon, 18 Oct 2010 23:29:37 +0530
Subject: cifs: convert cifs_tcp_ses_lock from a rwlock to a spinlock

cifs_tcp_ses_lock is a rwlock with protects the cifs_tcp_ses_list,
server->smb_ses_list and the ses->tcon_list. It also protects a few
ref counters in server, ses and tcon. In most cases the critical section
doesn't seem to be large, in a few cases where it is slightly large, there
seem to be really no benefit from concurrent access. I briefly considered RCU
mechanism but it appears to me that there is no real need.

Replace it with a spinlock and get rid of the last rwlock in the cifs code.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 12 ++++-----
 fs/cifs/cifsfs.c     |  8 +++---
 fs/cifs/cifsglob.h   |  2 +-
 fs/cifs/cifssmb.c    |  6 ++---
 fs/cifs/connect.c    | 70 ++++++++++++++++++++++++++--------------------------
 fs/cifs/misc.c       | 14 +++++------
 fs/cifs/sess.c       |  4 +--
 7 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..103ab8b605b0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -148,7 +148,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Servers:");
 
 	i = 0;
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp1, &cifs_tcp_ses_list) {
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
@@ -230,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			spin_unlock(&GlobalMid_Lock);
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
 	/* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		atomic_set(&totBufAllocCount, 0);
 		atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-		read_lock(&cifs_tcp_ses_lock);
+		spin_lock(&cifs_tcp_ses_lock);
 		list_for_each(tmp1, &cifs_tcp_ses_list) {
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
@@ -303,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 				}
 			}
 		}
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 	}
 
 	return count;
@@ -343,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		GlobalCurrentXid, GlobalMaxActiveXid);
 
 	i = 0;
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp1, &cifs_tcp_ses_list) {
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
@@ -397,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 			}
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f1d9c71e807f..cb77915a445b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -482,16 +482,16 @@ static void cifs_umount_begin(struct super_block *sb)
 
 	tcon = cifs_sb_master_tcon(cifs_sb);
 
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
 		/* we have other mounts to same share or we have
 		   already tried to force umount this and woken up
 		   all waiting network requests, nothing to do */
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	} else if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -940,7 +940,7 @@ init_cifs(void)
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
-	rwlock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&cifs_file_list_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 18ee0adda306..28337cba0295 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -703,7 +703,7 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
  * the reference counters for the server, smb session, and tcon. Finally,
  * changes to the tcon->tidStatus should be done while holding this lock.
  */
-GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
+GLOBAL_EXTERN spinlock_t		cifs_tcp_ses_lock;
 
 /*
  * This lock protects the cifs_file->llist and cifs_file->flist
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bfb59a68e4fd..e98f1f317b15 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -593,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			rc = -EIO;
 			goto neg_err_exit;
 		}
-		read_lock(&cifs_tcp_ses_lock);
+		spin_lock(&cifs_tcp_ses_lock);
 		if (server->srv_count > 1) {
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
@@ -605,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 					16);
 			}
 		} else {
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			memcpy(server->server_GUID,
 			       pSMBr->u.extended_response.GUID, 16);
 		}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 019f00380d12..7e73176acb58 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -150,7 +150,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->need_reconnect = true;
@@ -160,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			tcon->need_reconnect = true;
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
@@ -637,9 +637,9 @@ multi_t2_fnd:
 	} /* end while !EXITING */
 
 	/* take it off the list, if it's not already */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_del_init(&server->tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
@@ -677,7 +677,7 @@ multi_t2_fnd:
 	 * BB: we shouldn't have to do any of this. It shouldn't be
 	 * possible to exit from the thread with active SMB sessions
 	 */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (list_empty(&server->pending_mid_q)) {
 		/* loop through server session structures attached to this and
 		    mark them dead */
@@ -687,7 +687,7 @@ multi_t2_fnd:
 			ses->status = CifsExiting;
 			ses->server = NULL;
 		}
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 	} else {
 		/* although we can not zero the server struct pointer yet,
 		since there are active requests which may depnd on them,
@@ -710,7 +710,7 @@ multi_t2_fnd:
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -733,12 +733,12 @@ multi_t2_fnd:
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
 	/* BB: This shouldn't be necessary, see above */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->server = NULL;
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
@@ -1524,7 +1524,7 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
 	struct TCP_Server_Info *server;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
 		if (!match_address(server, addr,
 				   (struct sockaddr *)&vol->srcaddr))
@@ -1534,11 +1534,11 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 			continue;
 
 		++server->srv_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1547,14 +1547,14 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
 	struct task_struct *task;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--server->srv_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&server->tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
@@ -1679,9 +1679,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 
 	/* thread spawned, put it on the list */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	cifs_fscache_get_client_cookie(tcp_ses);
 
@@ -1703,7 +1703,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
 	struct cifsSesInfo *ses;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
 		switch (server->secType) {
 		case Kerberos:
@@ -1723,10 +1723,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 				continue;
 		}
 		++ses->ses_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return ses;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1737,14 +1737,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server = ses->server;
 
 	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&ses->smb_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	if (ses->status == CifsGood) {
 		xid = GetXid();
@@ -1841,9 +1841,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 		goto get_ses_fail;
 
 	/* success, put it on the list */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&ses->smb_ses_list, &server->smb_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	FreeXid(xid);
 	return ses;
@@ -1860,7 +1860,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 	struct list_head *tmp;
 	struct cifsTconInfo *tcon;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &ses->tcon_list) {
 		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
 		if (tcon->tidStatus == CifsExiting)
@@ -1869,10 +1869,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 			continue;
 
 		++tcon->tc_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return tcon;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1883,14 +1883,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	struct cifsSesInfo *ses = tcon->ses;
 
 	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--tcon->tc_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&tcon->tcon_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	xid = GetXid();
 	CIFSSMBTDis(xid, tcon);
@@ -1963,9 +1963,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
 	tcon->nocase = volume_info->nocase;
 	tcon->local_lease = volume_info->local_lease;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&tcon->tcon_list, &ses->tcon_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	cifs_fscache_get_super_cookie(tcon);
 
@@ -3225,9 +3225,9 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	vol_info->secFlg = CIFSSEC_MUST_KRB5;
 
 	/* get a reference for the same TCP session */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	++master_tcon->ses->server->srv_count;
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
 	if (IS_ERR(ses)) {
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index de6073cccd9c..a7b492c213cd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 				if (current_fsuid() != treeCon->ses->linux_uid) {
 					cFYI(1, "Multiuser mode and UID "
 						 "did not match tcon uid");
-					read_lock(&cifs_tcp_ses_lock);
+					spin_lock(&cifs_tcp_ses_lock);
 					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
 						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 							}
 						}
 					}
-					read_unlock(&cifs_tcp_ses_lock);
+					spin_unlock(&cifs_tcp_ses_lock);
 				}
 			}
 		}
@@ -551,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		return false;
 
 	/* look up tcon based on tid & uid */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &srv->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		list_for_each(tmp1, &ses->tcon_list) {
@@ -573,7 +573,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				 */
 				if (netfile->closePend) {
 					spin_unlock(&cifs_file_list_lock);
-					read_unlock(&cifs_tcp_ses_lock);
+					spin_unlock(&cifs_tcp_ses_lock);
 					return true;
 				}
 
@@ -595,16 +595,16 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				netfile->oplock_break_cancelled = false;
 
 				spin_unlock(&cifs_file_list_lock);
-				read_unlock(&cifs_tcp_ses_lock);
+				spin_unlock(&cifs_tcp_ses_lock);
 				return true;
 			}
 			spin_unlock(&cifs_file_list_lock);
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, "No matching file for oplock break");
 			return true;
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	cFYI(1, "Can not process oplock break for non-existent connection");
 	return true;
 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index e35dc60d3255..2a11efd96592 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -80,7 +80,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
 	if (max_vcs < 2)
 		max_vcs = 0xFFFF;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
 			goto get_vc_num_exit;  /* vcnum will be zero */
 	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +112,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
 		vcnum = i;
 	ses->vcnum = vcnum;
 get_vc_num_exit:
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	return cpu_to_le16(vcnum);
 }
-- 
cgit v1.2.3


From 6de5bd128d381ad88ac6d419a5e597048eb468cf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 11 Sep 2010 18:00:57 +0200
Subject: BKL: introduce CONFIG_BKL.

With all the patches we have queued in the BKL removal tree, only a
few dozen modules are left that actually rely on the BKL, and even
there are lots of low-hanging fruit. We need to decide what to do
about them, this patch illustrates one of the options:

Every user of the BKL is marked as 'depends on BKL' in Kconfig,
and the CONFIG_BKL becomes a user-visible option. If it gets
disabled, no BKL using module can be built any more and the BKL
code itself is compiled out.

The one exception is file locking, which is practically always
enabled and does a 'select BKL' instead. This effectively forces
CONFIG_BKL to be enabled until we have solved the fs/lockd
mess and can apply the patch that removes the BKL from fs/locks.c.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/gpu/drm/Kconfig         | 5 ++++-
 drivers/media/Kconfig           | 1 +
 drivers/net/appletalk/Kconfig   | 1 +
 drivers/staging/cx25821/Kconfig | 1 +
 drivers/staging/easycap/Kconfig | 1 +
 drivers/staging/go7007/Kconfig  | 1 +
 drivers/staging/usbip/Kconfig   | 2 +-
 fs/Kconfig                      | 1 +
 fs/adfs/Kconfig                 | 1 +
 fs/autofs/Kconfig               | 1 +
 fs/hpfs/Kconfig                 | 1 +
 fs/nfs/Kconfig                  | 1 +
 fs/nfsd/Kconfig                 | 1 +
 fs/smbfs/Kconfig                | 1 +
 fs/udf/Kconfig                  | 1 +
 fs/ufs/Kconfig                  | 1 +
 include/linux/smp_lock.h        | 7 +++++--
 init/Kconfig                    | 2 +-
 lib/Kconfig.debug               | 9 +++++++++
 net/ipx/Kconfig                 | 1 +
 net/x25/Kconfig                 | 1 +
 21 files changed, 36 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 4cab0c6397e3..7af443672626 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -73,7 +73,8 @@ source "drivers/gpu/drm/radeon/Kconfig"
 
 config DRM_I810
 	tristate "Intel I810"
-	depends on DRM && AGP && AGP_INTEL
+	# BKL usage in order to avoid AB-BA deadlocks, may become BROKEN_ON_SMP
+	depends on DRM && AGP && AGP_INTEL && BKL
 	help
 	  Choose this option if you have an Intel I810 graphics card.  If M is
 	  selected, the module will be called i810.  AGP support is required
@@ -86,6 +87,8 @@ choice
 
 config DRM_I830
 	tristate "i830 driver"
+	# BKL usage in order to avoid AB-BA deadlocks, i830 may get removed
+	depends on BKL
 	help
 	  Choose this option if you have a system that has Intel 830M, 845G,
 	  852GM, 855GM or 865G integrated graphics.  If M is selected, the
diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig
index a28541b2b1a2..bad2cedb8d96 100644
--- a/drivers/media/Kconfig
+++ b/drivers/media/Kconfig
@@ -19,6 +19,7 @@ comment "Multimedia core support"
 
 config VIDEO_DEV
 	tristate "Video For Linux"
+	depends on BKL # used in many drivers for ioctl handling, need to kill
 	---help---
 	  V4L core support for video capture and overlay devices, webcams and
 	  AM/FM radio cards.
diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig
index 0a0e0cd81a23..20f97e7017ce 100644
--- a/drivers/net/appletalk/Kconfig
+++ b/drivers/net/appletalk/Kconfig
@@ -3,6 +3,7 @@
 #
 config ATALK
 	tristate "Appletalk protocol support"
+	depends on BKL # waiting to be removed from net/appletalk/ddp.c
 	select LLC
 	---help---
 	  AppleTalk is the protocol that Apple computers can use to communicate
diff --git a/drivers/staging/cx25821/Kconfig b/drivers/staging/cx25821/Kconfig
index df7756a95fad..813cb355ac01 100644
--- a/drivers/staging/cx25821/Kconfig
+++ b/drivers/staging/cx25821/Kconfig
@@ -1,6 +1,7 @@
 config VIDEO_CX25821
 	tristate "Conexant cx25821 support"
 	depends on DVB_CORE && VIDEO_DEV && PCI && I2C && INPUT
+	depends on BKL # please fix
 	select I2C_ALGOBIT
 	select VIDEO_BTCX
 	select VIDEO_TVEEPROM
diff --git a/drivers/staging/easycap/Kconfig b/drivers/staging/easycap/Kconfig
index bd96f39f2735..9d5fe4ddc30a 100644
--- a/drivers/staging/easycap/Kconfig
+++ b/drivers/staging/easycap/Kconfig
@@ -1,6 +1,7 @@
 config EASYCAP
 	tristate "EasyCAP USB ID 05e1:0408 support"
 	depends on USB && VIDEO_DEV
+	depends on BKL # please fix
 
 	---help---
 	  This is an integrated audio/video driver for EasyCAP cards with
diff --git a/drivers/staging/go7007/Kconfig b/drivers/staging/go7007/Kconfig
index e47f683a323e..75fa46805527 100644
--- a/drivers/staging/go7007/Kconfig
+++ b/drivers/staging/go7007/Kconfig
@@ -1,6 +1,7 @@
 config VIDEO_GO7007
 	tristate "WIS GO7007 MPEG encoder support"
 	depends on VIDEO_DEV && PCI && I2C && INPUT
+	depends on BKL # please fix
 	depends on SND
 	select VIDEOBUF_DMA_SG
 	select VIDEO_IR
diff --git a/drivers/staging/usbip/Kconfig b/drivers/staging/usbip/Kconfig
index 2c1d10acb8b5..b11ec379b5c2 100644
--- a/drivers/staging/usbip/Kconfig
+++ b/drivers/staging/usbip/Kconfig
@@ -1,6 +1,6 @@
 config USB_IP_COMMON
 	tristate "USB IP support (EXPERIMENTAL)"
-	depends on USB && NET && EXPERIMENTAL
+	depends on USB && NET && EXPERIMENTAL && BKL
 	default N
 	---help---
 	  This enables pushing USB packets over IP to allow remote
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..65781de44fc0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,6 +50,7 @@ endif # BLOCK
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
+	select BKL # while lockd still uses it.
 	help
 	  This option enables standard file locking support, required
           for filesystems like NFS and for the flock() system
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
+	depends on BKL # need to fix
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index 5f3bea90911e..480e210c83ab 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,5 +1,6 @@
 config AUTOFS_FS
 	tristate "Kernel automounter support"
+	depends on BKL # unfixable, just use autofs4
 	help
 	  The automounter is a tool to automatically mount remote file systems
 	  on demand. This implementation is partially kernel-based to reduce
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
 	depends on BLOCK
+	depends on BKL # nontrivial to fix
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 6c2aad49d731..10b9524bb908 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,7 @@
 config NFS_FS
 	tristate "NFS client support"
 	depends on INET && FILE_LOCKING
+	depends on BKL # fix as soon as lockd is done
 	select LOCKD
 	select SUNRPC
 	select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 95932f523aef..429d4a142276 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -2,6 +2,7 @@ config NFSD
 	tristate "NFS server support"
 	depends on INET
 	depends on FILE_LOCKING
+	depends on BKL # fix as soon as lockd is done
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
index e668127c8b2e..2bc24a8c4039 100644
--- a/fs/smbfs/Kconfig
+++ b/fs/smbfs/Kconfig
@@ -1,5 +1,6 @@
 config SMB_FS
 	tristate "SMB file system support (OBSOLETE, please use CIFS)"
+	depends on BKL # probably unfixable
 	depends on INET
 	select NLS
 	help
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
 config UDF_FS
 	tristate "UDF file system support"
+	depends on BKL # needs serious work to remove
 	select CRC_ITU_T
 	help
 	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
 config UFS_FS
 	tristate "UFS file system support (read only)"
 	depends on BLOCK
+	depends on BKL # probably fixable
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index 2ea1dd1ba21c..291f721144c2 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -54,12 +54,15 @@ static inline void cycle_kernel_lock(void)
 
 #else
 
+#ifdef CONFIG_BKL /* provoke build bug if not set */
 #define lock_kernel()
 #define unlock_kernel()
-#define release_kernel_lock(task)		do { } while(0)
 #define cycle_kernel_lock()			do { } while(0)
-#define reacquire_kernel_lock(task)		0
 #define kernel_locked()				1
+#endif /* CONFIG_BKL */
+
+#define release_kernel_lock(task)		do { } while(0)
+#define reacquire_kernel_lock(task)		0
 
 #endif /* CONFIG_LOCK_KERNEL */
 #endif /* __LINUX_SMPLOCK_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd9..2005a1d49928 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -64,7 +64,7 @@ config BROKEN_ON_SMP
 
 config LOCK_KERNEL
 	bool
-	depends on SMP || PREEMPT
+	depends on (SMP || PREEMPT) && BKL
 	default y
 
 config INIT_ENV_ARG_LIMIT
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1b4afd2e6ca0..088eea1c2bef 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -461,6 +461,15 @@ config DEBUG_MUTEXES
 	 This feature allows mutex semantics violations to be detected and
 	 reported.
 
+config BKL
+	bool "Big Kernel Lock" if (SMP || PREEMPT)
+	default y
+	help
+	  This is the traditional lock that is used in old code instead
+	  of proper locking. All drivers that use the BKL should depend
+	  on this symbol.
+	  Say Y here unless you are working on removing the BKL.
+
 config DEBUG_LOCK_ALLOC
 	bool "Lock debugging: detect incorrect freeing of live locks"
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index e9ad0062fbb6..02549cb2c328 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -3,6 +3,7 @@
 #
 config IPX
 	tristate "The IPX protocol"
+	depends on BKL # should be fixable
 	select LLC
 	---help---
 	  This is support for the Novell networking protocol, IPX, commonly
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e6759c9660bb..2196e55e4f61 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,6 +5,7 @@
 config X25
 	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
+	depends on BKL # should be fixable
 	---help---
 	  X.25 is a set of standardized network protocols, similar in scope to
 	  frame relay; the one physical line from your box to the X.25 network
-- 
cgit v1.2.3


From cd5b814458e5554457c6e62f17aed122145b065e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 2 Oct 2010 17:03:35 -0400
Subject: nfsd4: don't cache seq_misordered replies

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 02c23b7c5cd5..7f1282859cd6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1510,7 +1510,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		if (status) {
 			/* an unconfirmed replay returns misordered */
 			status = nfserr_seq_misordered;
-			goto out_cache;
+			goto out;
 		}
 
 		cs_slot->sl_seqid++; /* from 0 to 1 */
@@ -1549,7 +1549,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	       NFS4_MAX_SESSIONID_LEN);
 	cr_ses->seqid = cs_slot->sl_seqid;
 
-out_cache:
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
 out:
-- 
cgit v1.2.3


From edd76786633a3145661c7a90c9baccae8e3c9e84 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 14 Jun 2010 22:26:31 -0400
Subject: nfsd4: move callback setup into session init code

The backchannel should  be associated with a session, it isn't really
global to the client.

We do, however, want a pointer global to the client which tracks which
session we're currently using for client-based callbacks.

This is a first step in that direction; for now, just reshuffling of
code with no significant change in behavior.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 29 ++++++++++++++---------------
 fs/nfsd/state.h     |  1 +
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7f1282859cd6..db5d8c8537ed 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -771,6 +771,19 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 		free_session(&new->se_ref);
 		return nfserr_jukebox;
 	}
+	if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+		struct sockaddr *sa = svc_addr(rqstp);
+
+		clp->cl_cb_session = new;
+		clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+		svc_xprt_get(rqstp->rq_xprt);
+		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+		clp->cl_cb_conn.cb_minorversion = 1;
+		clp->cl_cb_conn.cb_prog = cses->callback_prog;
+		clp->cl_cb_seq_nr = 1;
+		nfsd4_probe_callback(clp, &clp->cl_cb_conn);
+	}
 	return nfs_ok;
 }
 
@@ -1045,7 +1058,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	clp->cl_flavor = rqstp->rq_flavor;
 	copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	gen_confirm(clp);
-
+	clp->cl_cb_session = NULL;
 	return clp;
 }
 
@@ -1515,20 +1528,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
-
-		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(rqstp->rq_xprt);
-			rpc_copy_addr(
-				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-				sa);
-			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-			unconf->cl_cb_conn.cb_minorversion =
-				cstate->minorversion;
-			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-		}
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 8d5e2370cce0..6e63c1d272bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -234,6 +234,7 @@ struct nfs4_client {
 	u32			cl_cb_ident;
 	atomic_t		cl_cb_set;
 	struct nfsd4_callback	cl_cb_null;
+	struct nfsd4_session	*cl_cb_session;
 
 	/* for all client information that callback code might need: */
 	spinlock_t		cl_lock;
-- 
cgit v1.2.3


From 90c8145bb6fe1d9e0a808de6a701748967588bbd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 14 Jun 2010 17:49:37 -0400
Subject: nfsd4: use client pointer to backchannel session

Instead of copying the sessionid, use the new cl_cb_session pointer,
which indicates which session we're using for the backchannel.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 9 +++++----
 fs/nfsd/nfs4state.c    | 4 +---
 fs/nfsd/state.h        | 1 -
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a269dbeff150..78ac779c09ff 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -251,6 +251,7 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
+	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
 
 	if (hdr->minorversion == 0)
 		return;
@@ -258,7 +259,7 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
 
 	WRITE32(OP_CB_SEQUENCE);
-	WRITEMEM(cb->cb_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	WRITE32(cb->cb_clp->cl_cb_seq_nr);
 	WRITE32(0);		/* slotid, always 0 */
 	WRITE32(0);		/* highest slotid always 0 */
@@ -341,6 +342,7 @@ static int
 decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct rpc_rqst *rqstp)
 {
+	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
 	struct nfs4_sessionid id;
 	int status;
 	u32 dummy;
@@ -362,8 +364,7 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
 	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-	if (memcmp(id.data, cb->cb_clp->cl_sessionid.data,
-		   NFS4_MAX_SESSIONID_LEN)) {
+	if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out;
 	}
@@ -587,7 +588,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		struct rpc_task *task)
 {
-	u32 *ptr = (u32 *)clp->cl_sessionid.data;
+	u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
 	int status = 0;
 
 	dprintk("%s: %u:%u:%u:%u\n", __func__,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index db5d8c8537ed..c942511f73e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -753,8 +753,6 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 
 	new->se_client = clp;
 	gen_sessionid(new);
-	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
-	       NFS4_MAX_SESSIONID_LEN);
 
 	INIT_LIST_HEAD(&new->se_conns);
 
@@ -1544,7 +1542,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	if (status)
 		goto out;
 
-	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	memcpy(cr_ses->sessionid.data, conf->cl_cb_session->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
 	cr_ses->seqid = cs_slot->sl_seqid;
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6e63c1d272bf..cdce26ad50b5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -243,7 +243,6 @@ struct nfs4_client {
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
-	struct nfs4_sessionid	cl_sessionid;
 	/* number of rpc's in progress over an associated session: */
 	atomic_t		cl_refcount;
 
-- 
cgit v1.2.3


From ac7c46f29a44f6d7f6d2e36dc874c0b7056acad2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 14 Jun 2010 19:01:57 -0400
Subject: nfsd4: make backchannel sequence number per-session

Currently we don't deal well with a client that has multiple sessions
associated with it (even simultaneously, or serially over the lifetime
of the client).

In particular, we don't attempt to keep the backchannel running after
the original session diseappears.

We will fix that soon.

Once we do that, we need the slot sequence number to be per-session;
otherwise, for example, we cannot correctly handle a case like this:

	- All session 1 connections are lost.
	- The client creates session 2.  We use it for the backchannel
	  (since it's the only working choice).
	- The client gives us a new connection to use with session 1.
	- The client destroys session 2.

At this point our only choice is to go back to using session 1.  When we
do so we must use the sequence number that is next for session 1.  We
therefore need to maintain multiple sequence number streams.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c |  8 ++++----
 fs/nfsd/nfs4state.c    | 22 ++++++++++++----------
 fs/nfsd/state.h        |  2 +-
 3 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 78ac779c09ff..5df9dda47bf4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -260,7 +260,7 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 
 	WRITE32(OP_CB_SEQUENCE);
 	WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(cb->cb_clp->cl_cb_seq_nr);
+	WRITE32(ses->se_cb_seq_nr);
 	WRITE32(0);		/* slotid, always 0 */
 	WRITE32(0);		/* highest slotid always 0 */
 	WRITE32(0);		/* cachethis always 0 */
@@ -369,7 +369,7 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		goto out;
 	}
 	READ32(dummy);
-	if (dummy != cb->cb_clp->cl_cb_seq_nr) {
+	if (dummy != ses->se_cb_seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out;
 	}
@@ -643,11 +643,11 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 
 	if (clp->cl_cb_conn.cb_minorversion) {
 		/* No need for lock, access serialized in nfsd4_cb_prepare */
-		++clp->cl_cb_seq_nr;
+		++clp->cl_cb_session->se_cb_seq_nr;
 		clear_bit(0, &clp->cl_cb_slot_busy);
 		rpc_wake_up_next(&clp->cl_cb_waitq);
 		dprintk("%s: freed slot, new seqid=%d\n", __func__,
-			clp->cl_cb_seq_nr);
+			clp->cl_cb_session->se_cb_seq_nr);
 
 		/* We're done looking into the sequence information */
 		task->tk_msg.rpc_resp = NULL;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c942511f73e6..6367c445d015 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -725,8 +725,7 @@ void free_session(struct kref *kref)
 	kfree(ses);
 }
 
-
-static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	struct nfsd4_session *new;
 	struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
@@ -747,7 +746,7 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 	new = alloc_session(slotsize, numslots);
 	if (!new) {
 		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
-		return nfserr_jukebox;
+		return NULL;
 	}
 	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
 
@@ -756,6 +755,7 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 
 	INIT_LIST_HEAD(&new->se_conns);
 
+	new->se_cb_seq_nr = 1;
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
 	idx = hash_sessionid(&new->se_sessionid);
@@ -765,9 +765,10 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 	spin_unlock(&client_lock);
 
 	status = nfsd4_new_conn(rqstp, new);
+	/* whoops: benny points out, status is ignored! (err, or bogus) */
 	if (status) {
 		free_session(&new->se_ref);
-		return nfserr_jukebox;
+		return NULL;
 	}
 	if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
 		struct sockaddr *sa = svc_addr(rqstp);
@@ -779,10 +780,9 @@ static __be32 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
 		clp->cl_cb_conn.cb_minorversion = 1;
 		clp->cl_cb_conn.cb_prog = cses->callback_prog;
-		clp->cl_cb_seq_nr = 1;
 		nfsd4_probe_callback(clp, &clp->cl_cb_conn);
 	}
-	return nfs_ok;
+	return new;
 }
 
 /* caller must hold client_lock */
@@ -1485,6 +1485,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
 	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
+	struct nfsd4_session *new;
 	struct nfsd4_clid_slot *cs_slot = NULL;
 	int status = 0;
 
@@ -1538,11 +1539,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	cr_ses->flags &= ~SESSION4_PERSIST;
 	cr_ses->flags &= ~SESSION4_RDMA;
 
-	status = alloc_init_session(rqstp, conf, cr_ses);
-	if (status)
+	status = nfserr_jukebox;
+	new = alloc_init_session(rqstp, conf, cr_ses);
+	if (!new)
 		goto out;
-
-	memcpy(cr_ses->sessionid.data, conf->cl_cb_session->se_sessionid.data,
+	status = nfs_ok;
+	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
 	cr_ses->seqid = cs_slot->sl_seqid;
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cdce26ad50b5..7f5b2671ef18 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -172,6 +172,7 @@ struct nfsd4_session {
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
 	struct list_head	se_conns;
+	u32			se_cb_seq_nr;
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
@@ -249,7 +250,6 @@ struct nfs4_client {
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
-	u32			cl_cb_seq_nr;
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 };
-- 
cgit v1.2.3


From 86c3e16cc7aace4d1143952813b6cc2a80c51295 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 2 Oct 2010 17:04:00 -0400
Subject: nfsd4: confirm only on succesful create_session

Following rfc 5661, section 18.36.4: "If the session is not successfully
created, then no changes are made to any client records on the server."
We shouldn't be confirming or incrementing the sequence id in this case.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6367c445d015..7e817d13cd82 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1487,6 +1487,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	struct nfs4_client *conf, *unconf;
 	struct nfsd4_session *new;
 	struct nfsd4_clid_slot *cs_slot = NULL;
+	bool confirm_me = false;
 	int status = 0;
 
 	nfs4_lock_state();
@@ -1509,7 +1510,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cs_slot->sl_seqid, cr_ses->seqid);
 			goto out;
 		}
-		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1525,8 +1525,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out;
 		}
 
-		cs_slot->sl_seqid++; /* from 0 to 1 */
-		move_to_confirmed(unconf);
+		confirm_me = true;
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
@@ -1546,10 +1545,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	status = nfs_ok;
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
+	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
+	if (confirm_me)
+		move_to_confirmed(conf);
 out:
 	nfs4_unlock_state();
 	dprintk("%s returns %d\n", __func__, ntohl(status));
-- 
cgit v1.2.3


From d29c374cd20de620898d2936396048518809ae24 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Jun 2010 17:34:11 -0400
Subject: nfsd4: track backchannel connections

We need to keep track of which connections are available for use with
the backchannel, which for the forechannel, and which for both.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7e817d13cd82..c470cb78c6c1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -644,7 +644,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 	spin_unlock(&clp->cl_lock);
 }
 
-static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp)
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
 {
 	struct nfsd4_conn *conn;
 
@@ -653,7 +653,7 @@ static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp)
 		return NULL;
 	svc_xprt_get(rqstp->rq_xprt);
 	conn->cn_xprt = rqstp->rq_xprt;
-	conn->cn_flags = NFS4_CDFC4_FORE;
+	conn->cn_flags = flags;
 	INIT_LIST_HEAD(&conn->cn_xpt_user.list);
 	return conn;
 }
@@ -682,8 +682,11 @@ static void nfsd4_register_conn(struct nfsd4_conn *conn)
 static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 {
 	struct nfsd4_conn *conn;
+	u32 flags = NFS4_CDFC4_FORE;
 
-	conn = alloc_conn(rqstp);
+	if (ses->se_flags & SESSION4_BACK_CHAN)
+		flags |= NFS4_CDFC4_BACK;
+	conn = alloc_conn(rqstp, flags);
 	if (!conn)
 		return nfserr_jukebox;
 	nfsd4_hash_conn(conn, ses);
@@ -1640,7 +1643,7 @@ static void nfsd4_sequence_check_conn(struct svc_rqst *rqstp, struct nfsd4_sessi
 	if (c)
 		return;
 
-	new = alloc_conn(rqstp);
+	new = alloc_conn(rqstp, NFS4_CDFC4_FORE);
 
 	spin_lock(&clp->cl_lock);
 	c = __nfsd4_find_conn(rqstp, ses);
-- 
cgit v1.2.3


From 8b5ce5cd44743af84507721fa2cb4125ae67955c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Oct 2010 17:31:50 -0400
Subject: nfsd4: callback program number is per-session

The callback program is allowed to depend on the session which the
callback is going over.

No change in behavior yet, while we still only do callbacks over a
single session for the lifetime of the client.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 1 +
 fs/nfsd/nfs4state.c    | 2 +-
 fs/nfsd/state.h        | 4 +++-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5df9dda47bf4..140bb3656a24 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -498,6 +498,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 		return -EINVAL;
 	if (conn->cb_minorversion) {
 		args.bc_xprt = conn->cb_xprt;
+		args.prognumber = clp->cl_cb_session->se_cb_prog;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c470cb78c6c1..59bc0011516b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -760,6 +760,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 
 	new->se_cb_seq_nr = 1;
 	new->se_flags = cses->flags;
+	new->se_cb_prog = cses->callback_prog;
 	kref_init(&new->se_ref);
 	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&client_lock);
@@ -782,7 +783,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
 		clp->cl_cb_conn.cb_minorversion = 1;
-		clp->cl_cb_conn.cb_prog = cses->callback_prog;
 		nfsd4_probe_callback(clp, &clp->cl_cb_conn);
 	}
 	return new;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7f5b2671ef18..b3bed366aba4 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -96,7 +96,8 @@ struct nfs4_cb_conn {
 	/* SETCLIENTID info */
 	struct sockaddr_storage	cb_addr;
 	size_t			cb_addrlen;
-	u32                     cb_prog;
+	u32                     cb_prog; /* used only in 4.0 case;
+					    per-session otherwise */
 	u32			cb_minorversion;
 	u32                     cb_ident;	/* minorversion 0 only */
 	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
@@ -172,6 +173,7 @@ struct nfsd4_session {
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
 	struct list_head	se_conns;
+	u32			se_cb_prog;
 	u32			se_cb_seq_nr;
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
-- 
cgit v1.2.3


From 5a3c9d71343cf27b7afef24ed312368d48dada09 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Oct 2010 17:56:52 -0400
Subject: nfsd4: separate callback change and callback probe

Only one of the nfsd4_callback_probe callers actually cares about
changing the callback information.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 15 ++++++++++-----
 fs/nfsd/nfs4state.c    |  7 ++++---
 fs/nfsd/state.h        |  3 ++-
 3 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 140bb3656a24..d38ee3c55a08 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -550,7 +550,7 @@ int set_callback_cred(void)
 
 static struct workqueue_struct *callback_wq;
 
-void do_probe_callback(struct nfs4_client *clp)
+static void do_probe_callback(struct nfs4_client *clp)
 {
 	struct nfsd4_callback *cb = &clp->cl_cb_null;
 
@@ -568,17 +568,22 @@ void do_probe_callback(struct nfs4_client *clp)
 }
 
 /*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
  */
-void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+void nfsd4_probe_callback(struct nfs4_client *clp)
+{
+	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+	do_probe_callback(clp);
+}
+
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
 	BUG_ON(atomic_read(&clp->cl_cb_set));
 
 	spin_lock(&clp->cl_lock);
 	memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
-	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
 	spin_unlock(&clp->cl_lock);
-	do_probe_callback(clp);
 }
 
 /*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 59bc0011516b..2327a8c00862 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -783,7 +783,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
 		clp->cl_cb_conn.cb_minorversion = 1;
-		nfsd4_probe_callback(clp, &clp->cl_cb_conn);
+		nfsd4_probe_callback(clp);
 	}
 	return new;
 }
@@ -1912,7 +1912,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfserr_clid_inuse;
 		else {
 			atomic_set(&conf->cl_cb_set, 0);
-			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1946,7 +1947,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index b3bed366aba4..bbc4d587b341 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -453,7 +453,8 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
-- 
cgit v1.2.3


From 792c95dd519c54d6b0fd6401b3da7ea67b0d6b72 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 12 Oct 2010 19:55:25 -0400
Subject: nfsd4: delay session removal till free_client

Have unhash_client_locked() remove client and associated sessions from
global hashes, but delay further dismantling till free_client().

(After unhash_client_locked(), the only remaining references outside the
destroying thread are from any connections which have xpt_user callbacks
registered.)

This will simplify locking on session destruction.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2327a8c00862..0f2643dac22a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -883,6 +883,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		list_del(&ses->se_perclnt);
+		nfsd4_put_session(ses);
+	}
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -909,15 +916,12 @@ release_session_client(struct nfsd4_session *session)
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
+	struct nfsd4_session *ses;
+
 	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session  *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				 se_perclnt);
-		unhash_session(ses);
-		nfsd4_put_session(ses);
-	}
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
+		list_del_init(&ses->se_hash);
 }
 
 static void
@@ -1031,6 +1035,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	if (clp == NULL)
 		return NULL;
 
+	INIT_LIST_HEAD(&clp->cl_sessions);
+
 	princ = svc_gss_principal(rqstp);
 	if (princ) {
 		clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -1047,7 +1053,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	spin_lock_init(&clp->cl_lock);
 	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
-- 
cgit v1.2.3


From 8323c3b2a6b6543919d5ebdddc7d52f192126161 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Oct 2010 19:36:51 -0400
Subject: nfsd4: move minorversion to client

The minorversion seems more a property of the client than the callback
channel.

Some time we should probably also enforce consistent minorversion usage
from the client; for now, this is just a cosmetic change.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c |  8 ++++----
 fs/nfsd/nfs4state.c    | 12 ++++++++++--
 fs/nfsd/state.h        |  2 +-
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d38ee3c55a08..67bcd2c72623 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -496,7 +496,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
-	if (conn->cb_minorversion) {
+	if (clp->cl_minorversion) {
 		args.bc_xprt = conn->cb_xprt;
 		args.prognumber = clp->cl_cb_session->se_cb_prog;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -620,7 +620,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
-	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+	u32 minorversion = clp->cl_minorversion;
 	int status = 0;
 
 	cb->cb_minorversion = minorversion;
@@ -645,9 +645,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 	struct nfs4_client *clp = dp->dl_client;
 
 	dprintk("%s: minorversion=%d\n", __func__,
-		clp->cl_cb_conn.cb_minorversion);
+		clp->cl_minorversion);
 
-	if (clp->cl_cb_conn.cb_minorversion) {
+	if (clp->cl_minorversion) {
 		/* No need for lock, access serialized in nfsd4_cb_prepare */
 		++clp->cl_cb_session->se_cb_seq_nr;
 		clear_bit(0, &clp->cl_cb_slot_busy);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f2643dac22a..ce0412fd23eb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -782,7 +782,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
 		svc_xprt_get(rqstp->rq_xprt);
 		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-		clp->cl_cb_conn.cb_minorversion = 1;
 		nfsd4_probe_callback(clp);
 	}
 	return new;
@@ -1200,7 +1199,6 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 	if (conn->cb_addr.ss_family == AF_INET6)
 		((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
 
-	conn->cb_minorversion = 0;
 	conn->cb_prog = se->se_callback_prog;
 	conn->cb_ident = se->se_callback_ident;
 	return;
@@ -1540,6 +1538,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	conf->cl_minorversion = 1;
 	/*
 	 * We do not support RDMA or persistent sessions
 	 */
@@ -1857,6 +1860,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		gen_clid(new);
 	}
+	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	new->cl_minorversion = 0;
 	gen_callback(new, setclid, rpc_get_scope_id(sa));
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index bbc4d587b341..39adc27b0685 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -98,7 +98,6 @@ struct nfs4_cb_conn {
 	size_t			cb_addrlen;
 	u32                     cb_prog; /* used only in 4.0 case;
 					    per-session otherwise */
-	u32			cb_minorversion;
 	u32                     cb_ident;	/* minorversion 0 only */
 	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
@@ -227,6 +226,7 @@ struct nfs4_client {
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_firststate;	/* recovery dir creation */
+	u32			cl_minorversion;
 
 	/* for v4.0 and v4.1 callbacks: */
 	struct nfs4_cb_conn	cl_cb_conn;
-- 
cgit v1.2.3


From 5d18c1c2a9a74e0f966c257520b8b7f5136c87b3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Oct 2010 23:00:12 -0400
Subject: nfsd4: only require krb5 principal for NFSv4.0 callbacks

In the sessions backchannel case, we don't need a krb5 principal name
for the client; we use the already-created forechannel credentials
instead.

Some cleanup, while we're there: make it clearer which code here is 4.0-
or sessions- specific.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 67bcd2c72623..143da2eecd7b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -481,22 +481,24 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 	};
 	struct rpc_create_args args = {
 		.net		= &init_net,
-		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *) &conn->cb_addr,
 		.addrsize	= conn->cb_addrlen,
 		.timeout	= &timeparms,
 		.program	= &cb_program,
-		.prognumber	= conn->cb_prog,
 		.version	= 0,
 		.authflavor	= clp->cl_flavor,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
-		.client_name    = clp->cl_principal,
 	};
 	struct rpc_clnt *client;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
-		return -EINVAL;
-	if (clp->cl_minorversion) {
+	if (clp->cl_minorversion == 0) {
+		if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+			return -EINVAL;
+		args.client_name = clp->cl_principal;
+		args.prognumber	= conn->cb_prog,
+		args.protocol = XPRT_TRANSPORT_TCP;
+		clp->cl_cb_ident = conn->cb_ident;
+	} else {
 		args.bc_xprt = conn->cb_xprt;
 		args.prognumber = clp->cl_cb_session->se_cb_prog;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -508,7 +510,6 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	clp->cl_cb_ident = conn->cb_ident;
 	clp->cl_cb_client = client;
 	return 0;
 
-- 
cgit v1.2.3


From 073c21416268658bd1bc573af85eeac2ebb56ed5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Oct 2010 15:53:15 +0200
Subject: BKL: remove BKL from qnx4

All uses of the BKL in qnx4 were the result of a pushdown into
code that doesn't really need it. As Christoph points out, this
is a read-only file system, which eliminates most of the races in
readdir/lookup.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Anders Larsen <al@alarsen.net>
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/qnx4/dir.c   |  4 ----
 fs/qnx4/inode.c | 14 +-------------
 fs/qnx4/namei.c |  4 ----
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
 	QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
 
-	lock_kernel();
-
 	while (filp->f_pos < inode->i_size) {
 		blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
 		bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		brelse(bh);
 	}
 out:
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 86a7be1399a8..01bad30026fc 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-	lock_kernel();
-
 	buf->f_type    = sb->s_magic;
 	buf->f_bsize   = sb->s_blocksize;
 	buf->f_blocks  = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 
-	unlock_kernel();
-
 	return 0;
 }
 
@@ -234,13 +229,9 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	struct qnx4_sb_info *qs;
 	int ret = -EINVAL;
 
-	lock_kernel();
-
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
-	if (!qs) {
-		unlock_kernel();
+	if (!qs)
 		return -ENOMEM;
-	}
 	s->s_fs_info = qs;
 
 	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
@@ -287,8 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
  		goto outi;
 
 	brelse(bh);
-
-	unlock_kernel();
 	return 0;
 
       outi:
@@ -298,7 +287,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
       outnobh:
 	kfree(qs);
 	s->s_fs_info = NULL;
-	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	int len = dentry->d_name.len;
 	struct inode *foundinode = NULL;
 
-	lock_kernel();
 	if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
 		goto out;
 	/* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 
 	foundinode = qnx4_iget(dir->i_sb, ino);
 	if (IS_ERR(foundinode)) {
-		unlock_kernel();
 		QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
 			   PTR_ERR(foundinode)));
 		return ERR_CAST(foundinode);
 	}
 out:
-	unlock_kernel();
 	d_add(dentry, foundinode);
 
 	return NULL;
-- 
cgit v1.2.3


From 6d7bccc2215c37205ede6c9cf84db64e7c4f9443 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Oct 2010 15:58:01 +0200
Subject: BKL: remove BKL from freevxfs

All uses of the BKL in freevxfs were the result of a pushdown into
code that doesn't really need it. As Christoph points out, this
is a read-only file system, which eliminates most of the races in
readdir/lookup.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/freevxfs/vxfs_lookup.c | 14 ++------------
 fs/freevxfs/vxfs_super.c  | 10 ----------
 2 files changed, 2 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
 #include <linux/highmem.h>
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include "vxfs.h"
 #include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
 	if (dp->d_name.len > VXFS_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 				 
-	lock_kernel();
 	ino = vxfs_inode_by_name(dip, dp);
 	if (ino) {
 		ip = vxfs_iget(dip->i_sb, ino);
-		if (IS_ERR(ip)) {
-			unlock_kernel();
+		if (IS_ERR(ip))
 			return ERR_CAST(ip);
-		}
 	}
-	unlock_kernel();
 	d_add(dp, ip);
 	return NULL;
 }
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 	u_long			page, npages, block, pblocks, nblocks, offset;
 	loff_t			pos;
 
-	lock_kernel();
-
 	switch ((long)fp->f_pos) {
 	case 0:
 		if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 
 	pos = fp->f_pos - 2;
 	
-	if (pos > VXFS_DIRROUND(ip->i_size)) {
-		unlock_kernel();
+	if (pos > VXFS_DIRROUND(ip->i_size))
 		return 0;
-	}
 
 	npages = dir_pages(ip);
 	nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 done:
 	fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
 out:
-	unlock_kernel();
 	return 0;
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index eb2b9e09c996..71b0148b8784 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
-	lock_kernel();
-
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
-
-	unlock_kernel();
 }
 
 /**
@@ -159,14 +154,11 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 	struct inode *root;
 	int ret = -EINVAL;
 
-	lock_kernel();
-
 	sbp->s_flags |= MS_RDONLY;
 
 	infp = kzalloc(sizeof(*infp), GFP_KERNEL);
 	if (!infp) {
 		printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
-		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -239,7 +231,6 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 		goto out_free_ilist;
 	}
 
-	unlock_kernel();
 	return 0;
 	
 out_free_ilist:
@@ -249,7 +240,6 @@ out_free_ilist:
 out:
 	brelse(bp);
 	kfree(infp);
-	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3


From cdff08e76612e53580139653403aedea979aa639 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 21 Oct 2010 22:46:14 +0000
Subject: [CIFS] move close processing  from cifs_close to cifsFileInfo_put

Now that it's feasible for a cifsFileInfo to outlive the filp under
which it was created, move the close processing into cifsFileInfo_put.

This means that the last user of the filehandle always does the actual
on the wire close call. This also allows us to get rid of the closePend
flag from cifsFileInfo. If we have an active reference to the file
then it's never going to have a close pending.

cifs_close is converted to simply put the filehandle.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |   1 -
 fs/cifs/file.c     | 187 +++++++++++++++++------------------------------------
 fs/cifs/misc.c     |  10 ---
 3 files changed, 60 insertions(+), 138 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 28337cba0295..3365e77f6f24 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -393,7 +393,6 @@ struct cifsFileInfo {
 	struct tcon_link *tlink;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
-	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool oplock_break_cancelled:1;
 	atomic_t count;		/* reference count */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a3634e43bd4f..8c81e7b14d53 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -238,7 +238,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	pCifsFile->dentry = dget(dentry);
 	pCifsFile->f_flags = file->f_flags;
 	pCifsFile->invalidHandle = false;
-	pCifsFile->closePend = false;
 	pCifsFile->tlink = cifs_get_tlink(tlink);
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
@@ -266,14 +265,55 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	return pCifsFile;
 }
 
-/* Release a reference on the file private data */
+/*
+ * Release a reference on the file private data. This may involve closing
+ * the filehandle out on the server.
+ */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
-	if (atomic_dec_and_test(&cifs_file->count)) {
-		cifs_put_tlink(cifs_file->tlink);
-		dput(cifs_file->dentry);
-		kfree(cifs_file);
+	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
+	struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
+	struct cifsLockInfo *li, *tmp;
+
+	spin_lock(&cifs_file_list_lock);
+	if (!atomic_dec_and_test(&cifs_file->count)) {
+		spin_unlock(&cifs_file_list_lock);
+		return;
+	}
+
+	/* remove it from the lists */
+	list_del(&cifs_file->flist);
+	list_del(&cifs_file->tlist);
+
+	if (list_empty(&cifsi->openFileList)) {
+		cFYI(1, "closing last open instance for inode %p",
+			cifs_file->dentry->d_inode);
+		cifsi->clientCanCacheRead = false;
+		cifsi->clientCanCacheAll  = false;
+	}
+	spin_unlock(&cifs_file_list_lock);
+
+	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
+		int xid, rc;
+
+		xid = GetXid();
+		rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
+		FreeXid(xid);
+	}
+
+	/* Delete any outstanding lock records. We'll lose them when the file
+	 * is closed anyway.
+	 */
+	mutex_lock(&cifs_file->lock_mutex);
+	list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+		list_del(&li->llist);
+		kfree(li);
 	}
+	mutex_unlock(&cifs_file->lock_mutex);
+
+	cifs_put_tlink(cifs_file->tlink);
+	dput(cifs_file->dentry);
+	kfree(cifs_file);
 }
 
 int cifs_open(struct inode *inode, struct file *file)
@@ -605,79 +645,11 @@ reopen_error_exit:
 
 int cifs_close(struct inode *inode, struct file *file)
 {
-	int rc = 0;
-	int xid, timeout;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	struct cifsFileInfo *pSMBFile = file->private_data;
-
-	xid = GetXid();
-
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = tlink_tcon(pSMBFile->tlink);
-	if (pSMBFile) {
-		struct cifsLockInfo *li, *tmp;
-		spin_lock(&cifs_file_list_lock);
-		pSMBFile->closePend = true;
-		if (pTcon) {
-			/* no sense reconnecting to close a file that is
-			   already closed */
-			if (!pTcon->need_reconnect) {
-				spin_unlock(&cifs_file_list_lock);
-				timeout = 2;
-				while ((atomic_read(&pSMBFile->count) != 1)
-					&& (timeout <= 2048)) {
-					/* Give write a better chance to get to
-					server ahead of the close.  We do not
-					want to add a wait_q here as it would
-					increase the memory utilization as
-					the struct would be in each open file,
-					but this should give enough time to
-					clear the socket */
-					cFYI(DBG2, "close delay, write pending");
-					msleep(timeout);
-					timeout *= 4;
-				}
-				if (!pTcon->need_reconnect &&
-				    !pSMBFile->invalidHandle)
-					rc = CIFSSMBClose(xid, pTcon,
-						  pSMBFile->netfid);
-			} else
-				spin_unlock(&cifs_file_list_lock);
-		} else
-			spin_unlock(&cifs_file_list_lock);
-
-		/* Delete any outstanding lock records.
-		   We'll lose them when the file is closed anyway. */
-		mutex_lock(&pSMBFile->lock_mutex);
-		list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
-			list_del(&li->llist);
-			kfree(li);
-		}
-		mutex_unlock(&pSMBFile->lock_mutex);
+	cifsFileInfo_put(file->private_data);
+	file->private_data = NULL;
 
-		spin_lock(&cifs_file_list_lock);
-		list_del(&pSMBFile->flist);
-		list_del(&pSMBFile->tlist);
-		spin_unlock(&cifs_file_list_lock);
-		cifsFileInfo_put(file->private_data);
-		file->private_data = NULL;
-	} else
-		rc = -EBADF;
-
-	spin_lock(&cifs_file_list_lock);
-	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, "closing last open instance for inode %p", inode);
-		/* if the file is not open we do not know if we can cache info
-		   on this inode, much less write behind and read ahead */
-		CIFS_I(inode)->clientCanCacheRead = false;
-		CIFS_I(inode)->clientCanCacheAll  = false;
-	}
-	spin_unlock(&cifs_file_list_lock);
-	if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
-		rc = CIFS_I(inode)->write_behind_rc;
-	FreeXid(xid);
-	return rc;
+	/* return code from the ->release op is always ignored */
+	return 0;
 }
 
 int cifs_closedir(struct inode *inode, struct file *file)
@@ -1024,13 +996,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			   we blocked so return what we managed to write */
 				return total_written;
 			}
-			if (open_file->closePend) {
-				FreeXid(xid);
-				if (total_written)
-					return total_written;
-				else
-					return -EBADF;
-			}
 			if (open_file->invalidHandle) {
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
@@ -1111,13 +1076,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if (open_file->closePend) {
-				FreeXid(xid);
-				if (total_written)
-					return total_written;
-				else
-					return -EBADF;
-			}
 			if (open_file->invalidHandle) {
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
@@ -1197,8 +1155,6 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 	   are always at the end of the list but since the first entry might
 	   have a close pending, we go through the whole list */
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
 		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
@@ -1244,8 +1200,6 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 	spin_lock(&cifs_file_list_lock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
 		if (!any_available && open_file->pid != current->tgid)
 			continue;
 		if (fsuid_only && open_file->uid != current_fsuid())
@@ -1260,34 +1214,18 @@ refind_writable:
 			}
 
 			spin_unlock(&cifs_file_list_lock);
+
 			/* Had to unlock since following call can block */
 			rc = cifs_reopen_file(open_file, false);
-			if (!rc) {
-				if (!open_file->closePend)
-					return open_file;
-				else { /* start over in case this was deleted */
-				       /* since the list could be modified */
-					spin_lock(&cifs_file_list_lock);
-					cifsFileInfo_put(open_file);
-					goto refind_writable;
-				}
-			}
+			if (!rc)
+				return open_file;
 
-			/* if it fails, try another handle if possible -
-			(we can not do this if closePending since
-			loop could be modified - in which case we
-			have to start at the beginning of the list
-			again. Note that it would be bad
-			to hold up writepages here (rather than
-			in caller) with continuous retries */
+			/* if it fails, try another handle if possible */
 			cFYI(1, "wp failed on reopen file");
-			spin_lock(&cifs_file_list_lock);
-			/* can not use this handle, no write
-			   pending on this one after all */
 			cifsFileInfo_put(open_file);
 
-			if (open_file->closePend) /* list could have changed */
-				goto refind_writable;
+			spin_lock(&cifs_file_list_lock);
+
 			/* else we simply continue to the next entry. Thus
 			   we do not loop on reopen errors.  If we
 			   can not reopen the file, for example if we
@@ -1808,8 +1746,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 		smb_read_data = NULL;
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
+			if (open_file->invalidHandle) {
 				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
@@ -1894,8 +1831,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		}
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
+			if (open_file->invalidHandle) {
 				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
@@ -2059,8 +1995,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
+			if (open_file->invalidHandle) {
 				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
@@ -2212,8 +2147,6 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 
 	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
 			spin_unlock(&cifs_file_list_lock);
 			return 1;
@@ -2372,7 +2305,7 @@ void cifs_oplock_break(struct work_struct *work)
 	 * not bother sending an oplock release if session to server still is
 	 * disconnected since oplock already released by the server
 	 */
-	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
+	if (!cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
 				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a7b492c213cd..1c681f6a6803 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -567,16 +567,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				if (pSMB->Fid != netfile->netfid)
 					continue;
 
-				/*
-				 * don't do anything if file is about to be
-				 * closed anyway.
-				 */
-				if (netfile->closePend) {
-					spin_unlock(&cifs_file_list_lock);
-					spin_unlock(&cifs_tcp_ses_lock);
-					return true;
-				}
-
 				cFYI(1, "file id match, oplock break");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 				pCifsInode->clientCanCacheAll = false;
-- 
cgit v1.2.3


From e52eec13cd6b7f30ab19081b387813e03e592ae5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 8 Sep 2010 16:54:17 +0200
Subject: SYSFS: Allow boot time switching between deprecated and modern sysfs
 layout

I have some systems which need legacy sysfs due to old tools that are
making assumptions that a directory can never be a symlink to another
directory, and it's a big hazzle to compile separate kernels for them.

This patch turns CONFIG_SYSFS_DEPRECATED into a run time option
that can be switched on/off the kernel command line. This way
the same binary can be used in both cases with just a option
on the command line.

The old CONFIG_SYSFS_DEPRECATED_V2 option is still there to set
the default. I kept the weird name to not break existing
config files.

Also the compat code can be still completely disabled by undefining
CONFIG_SYSFS_DEPRECATED_SWITCH -- just the optimizer takes
care of this now instead of lots of ifdefs. This makes the code
look nicer.

v2: This is an updated version on top of Kay's patch to only
handle the block devices. I tested it on my old systems
and that seems to work.

Cc: axboe@kernel.dk
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/kernel-parameters.txt |  9 +++++++++
 block/genhd.c                       |  7 ++-----
 drivers/base/class.c                |  4 ++--
 drivers/base/core.c                 | 26 +++++++++++++++++---------
 fs/partitions/check.c               | 19 +++++++++----------
 include/linux/device.h              |  7 +++++++
 init/Kconfig                        | 26 ++++++++++++++++++++++----
 7 files changed, 68 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3d854c9ae53a..67fa3fdc128f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2365,6 +2365,15 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	switches=	[HW,M68k]
 
+	sysfs.deprecated=0|1 [KNL]
+			Enable/disable old style sysfs layout for old udev
+			on older distributions. When this option is enabled
+			very new udev will not work anymore. When this option
+			is disabled (or CONFIG_SYSFS_DEPRECATED not compiled)
+			in older udev will not work anymore.
+			Default depends on CONFIG_SYSFS_DEPRECATED_V2 set in
+			the kernel configuration.
+
 	sysrq_always_enabled
 			[KNL]
 			Ignore sysrq setting - this boot parameter will
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..4e28a840bde0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -22,9 +22,7 @@
 #include "blk.h"
 
 static DEFINE_MUTEX(block_class_lock);
-#ifndef CONFIG_SYSFS_DEPRECATED
 struct kobject *block_depr;
-#endif
 
 /* for extended dynamic devt allocation, currently only one major is used */
 #define MAX_EXT_DEVT		(1 << MINORBITS)
@@ -803,10 +801,9 @@ static int __init genhd_device_init(void)
 
 	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
 
-#ifndef CONFIG_SYSFS_DEPRECATED
 	/* create top-level block dir */
-	block_depr = kobject_create_and_add("block", NULL);
-#endif
+	if (!sysfs_deprecated)
+		block_depr = kobject_create_and_add("block", NULL);
 	return 0;
 }
 
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 1078969889fd..9c63a5687d69 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -184,9 +184,9 @@ int __class_register(struct class *cls, struct lock_class_key *key)
 	if (!cls->dev_kobj)
 		cls->dev_kobj = sysfs_dev_char_kobj;
 
-#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
+#if defined(CONFIG_BLOCK)
 	/* let the block class directory show up in the root of sysfs */
-	if (cls != &block_class)
+	if (!sysfs_deprecated || cls != &block_class)
 		cp->class_subsys.kobj.kset = class_kset;
 #else
 	cp->class_subsys.kobj.kset = class_kset;
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 6cf9069f3150..f7f906f0a2f2 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -26,6 +26,19 @@
 #include "base.h"
 #include "power/power.h"
 
+#ifdef CONFIG_SYSFS_DEPRECATED
+#ifdef CONFIG_SYSFS_DEPRECATED_V2
+long sysfs_deprecated = 1;
+#else
+long sysfs_deprecated = 0;
+#endif
+static __init int sysfs_deprecated_setup(char *arg)
+{
+	return strict_strtol(arg, 10, &sysfs_deprecated);
+}
+early_param("sysfs.deprecated", sysfs_deprecated_setup);
+#endif
+
 int (*platform_notify)(struct device *dev) = NULL;
 int (*platform_notify_remove)(struct device *dev) = NULL;
 static struct kobject *dev_kobj;
@@ -617,14 +630,13 @@ static struct kobject *get_device_parent(struct device *dev,
 		struct kobject *parent_kobj;
 		struct kobject *k;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
 		/* block disks show up in /sys/block */
-		if (dev->class == &block_class) {
+		if (sysfs_deprecated && dev->class == &block_class) {
 			if (parent && parent->class == &block_class)
 				return &parent->kobj;
 			return &block_class.p->class_subsys.kobj;
 		}
-#endif
+
 		/*
 		 * If we have no parent, we live in "virtual".
 		 * Class-devices with a non class-device as parent, live
@@ -707,11 +719,9 @@ static int device_add_class_symlinks(struct device *dev)
 			goto out_subsys;
 	}
 
-#ifdef CONFIG_SYSFS_DEPRECATED
 	/* /sys/block has directories and does not need symlinks */
-	if (dev->class == &block_class)
+	if (sysfs_deprecated && dev->class == &block_class)
 		return 0;
-#endif
 
 	/* link in the class directory pointing to the device */
 	error = sysfs_create_link(&dev->class->p->class_subsys.kobj,
@@ -738,10 +748,8 @@ static void device_remove_class_symlinks(struct device *dev)
 	if (dev->parent && device_is_not_partition(dev))
 		sysfs_remove_link(&dev->kobj, "device");
 	sysfs_remove_link(&dev->kobj, "subsystem");
-#ifdef CONFIG_SYSFS_DEPRECATED
-	if (dev->class == &block_class)
+	if (sysfs_deprecated && dev->class == &block_class)
 		return;
-#endif
 	sysfs_delete_link(&dev->class->p->class_subsys.kobj, &dev->kobj, dev_name(dev));
 }
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..137bf9787853 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -513,14 +513,14 @@ void register_disk(struct gendisk *disk)
 
 	if (device_add(ddev))
 		return;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &ddev->kobj,
-				kobject_name(&ddev->kobj));
-	if (err) {
-		device_del(ddev);
-		return;
+	if (!sysfs_deprecated) {
+		err = sysfs_create_link(block_depr, &ddev->kobj,
+					kobject_name(&ddev->kobj));
+		if (err) {
+			device_del(ddev);
+			return;
+		}
 	}
-#endif
 	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -737,8 +737,7 @@ void del_gendisk(struct gendisk *disk)
 	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-#endif
+	if (!sysfs_deprecated)
+		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	device_del(disk_to_dev(disk));
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 516fecacf27b..dd4895313468 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -751,4 +751,11 @@ do {						     \
 	MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_CHARDEV_MAJOR(major) \
 	MODULE_ALIAS("char-major-" __stringify(major) "-*")
+
+#ifdef CONFIG_SYSFS_DEPRECATED
+extern long sysfs_deprecated;
+#else
+#define sysfs_deprecated 0
+#endif
+
 #endif /* _DEVICE_H_ */
diff --git a/init/Kconfig b/init/Kconfig
index 137609f33ebc..d742b6fca8d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -660,8 +660,12 @@ config SYSFS_DEPRECATED
 	depends on SYSFS
 	default n
 	help
-	  This option switches the layout of the "block" class devices, to not
-	  show up in /sys/class/block/, but only in /sys/block/.
+	  This option adds code that switches the layout of the "block" class
+	  devices, to not show up in /sys/class/block/, but only in
+	  /sys/block/.
+
+	  This switch is only active when the sysfs.deprecated=1 boot option is
+	  passed or the SYSFS_DEPRECATED_V2 option is set.
 
 	  This option allows new kernels to run on old distributions and tools,
 	  which might get confused by /sys/class/block/. Since 2007/2008 all
@@ -672,8 +676,22 @@ config SYSFS_DEPRECATED
 	  option enabled.
 
 	  Only if you are using a new kernel on an old distribution, you might
-	  need to say Y here. Never say Y, if the original kernel, that came
-	  with your distribution, has not set this option.
+	  need to say Y here.
+
+config SYSFS_DEPRECATED_V2
+	bool "enabled deprecated sysfs features by default"
+	default n
+	depends on SYSFS
+	depends on SYSFS_DEPRECATED
+	help
+	  Enable deprecated sysfs by default.
+
+	  See the CONFIG_SYSFS_DEPRECATED option for more details about this
+	  option.
+
+	  Only if you are using a new kernel on an old distribution, you might
+	  need to say Y here. Even then, odds are you would not need it
+	  enabled, you can always pass the boot option if absolutely necessary.
 
 config RELAY
 	bool "Kernel->user space relay support (formerly relayfs)"
-- 
cgit v1.2.3


From a6849fa1f7d7d7adbeb6a696beeabfa078acf173 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Mon, 20 Sep 2010 00:56:27 -0700
Subject: sysfs: Fail bin file mmap if vma close is implemented.

It is not reasonably possible to wrap vma->close().  To correctly
wrap close would imply calling close on any vmas that remain when
sysfs_remove_bin_file is called.  Finding the proper lists walking
them getting the locking right etc, requires deep knowledge of the
mm subsystem and as such would require assistence from the mm
subsystem to implement.  That assistence does not currently exist.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..d31d7b7d5426 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -190,23 +190,6 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	sysfs_put_active(attr_sd);
 }
 
-static void bin_vma_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
-	if (!bb->vm_ops || !bb->vm_ops->close)
-		return;
-
-	if (!sysfs_get_active(attr_sd))
-		return;
-
-	bb->vm_ops->close(vma);
-
-	sysfs_put_active(attr_sd);
-}
-
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
@@ -331,7 +314,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 
 static const struct vm_operations_struct bin_vm_ops = {
 	.open		= bin_vma_open,
-	.close		= bin_vma_close,
 	.fault		= bin_fault,
 	.page_mkwrite	= bin_page_mkwrite,
 	.access		= bin_access,
@@ -377,6 +359,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
 		goto out_put;
 
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
 	rc = 0;
 	bb->mmapped = 1;
 	bb->vm_ops = vma->vm_ops;
-- 
cgit v1.2.3


From 38f49a5132f24d29236820eb5c7dd956e47f94a3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Mon, 20 Sep 2010 00:57:03 -0700
Subject: sysfs: only access bin file vm_ops with the active lock

bb->vm_ops is a cached copy of the vm_ops of the underlying
sysfs bin file, which means that after sysfs_bin_remove_file
completes it is only longer valid to deference bb->vm_ops.

So move all of the tests of bb->vm_ops inside of where
we hold the sysfs active lock.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d31d7b7d5426..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,13 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	struct bin_buffer *bb = file->private_data;
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 
-	if (!bb->vm_ops || !bb->vm_ops->open)
+	if (!bb->vm_ops)
 		return;
 
 	if (!sysfs_get_active(attr_sd))
 		return;
 
-	bb->vm_ops->open(vma);
+	if (bb->vm_ops->open)
+		bb->vm_ops->open(vma);
 
 	sysfs_put_active(attr_sd);
 }
@@ -197,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->fault)
+	if (!bb->vm_ops)
 		return VM_FAULT_SIGBUS;
 
 	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
-	ret = bb->vm_ops->fault(vma, vmf);
+	ret = VM_FAULT_SIGBUS;
+	if (bb->vm_ops->fault)
+		ret = bb->vm_ops->fault(vma, vmf);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -219,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops)
 		return VM_FAULT_SIGBUS;
 
-	if (!bb->vm_ops->page_mkwrite)
-		return 0;
-
 	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
-	ret = bb->vm_ops->page_mkwrite(vma, vmf);
+	ret = 0;
+	if (bb->vm_ops->page_mkwrite)
+		ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -239,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->access)
+	if (!bb->vm_ops)
 		return -EINVAL;
 
 	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
-	ret = bb->vm_ops->access(vma, addr, buf, len, write);
+	ret = -EINVAL;
+	if (bb->vm_ops->access)
+		ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -259,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->set_policy)
+	if (!bb->vm_ops)
 		return 0;
 
 	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
-	ret = bb->vm_ops->set_policy(vma, new);
+	ret = 0;
+	if (bb->vm_ops->set_policy)
+		ret = bb->vm_ops->set_policy(vma, new);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -279,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct mempolicy *pol;
 
-	if (!bb->vm_ops || !bb->vm_ops->get_policy)
+	if (!bb->vm_ops)
 		return vma->vm_policy;
 
 	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
-	pol = bb->vm_ops->get_policy(vma, addr);
+	pol = vma->vm_policy;
+	if (bb->vm_ops->get_policy)
+		pol = bb->vm_ops->get_policy(vma, addr);
 
 	sysfs_put_active(attr_sd);
 	return pol;
@@ -299,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->migrate)
+	if (!bb->vm_ops)
 		return 0;
 
 	if (!sysfs_get_active(attr_sd))
 		return 0;
 
-	ret = bb->vm_ops->migrate(vma, from, to, flags);
+	ret = 0;
+	if (bb->vm_ops->migrate)
+		ret = bb->vm_ops->migrate(vma, from, to, flags);
 
 	sysfs_put_active(attr_sd);
 	return ret;
-- 
cgit v1.2.3


From f4a3e0bceb57466c31757f25e4e0ed108d1299ec Mon Sep 17 00:00:00 2001
From: "Dr. Werner Fink" <werner@suse.de>
Date: Wed, 22 Sep 2010 12:45:40 +0200
Subject: tty: Add a new file /proc/tty/consoles

Add a new file /proc/tty/consoles to be able to determine the registered
system console lines.  If the reading process holds /dev/console open at
the regular standard input stream the active device will be marked by an
asterisk.  Show possible operations and also decode the used flags of
the listed console lines.

Signed-off-by: Werner Fink <werner@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/filesystems/proc.txt |  32 ++++++++
 fs/proc/proc_tty.c                 | 158 +++++++++++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a6aca8740883..98223a676940 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1075,6 +1075,7 @@ Table 1-11: Files in /proc/tty
  drivers       list of drivers and their usage                
  ldiscs        registered line disciplines                    
  driver/serial usage statistic and status of single tty lines 
+ consoles      registered system console lines
 ..............................................................................
 
 To see  which  tty's  are  currently in use, you can simply look into the file
@@ -1093,6 +1094,37 @@ To see  which  tty's  are  currently in use, you can simply look into the file
   /dev/tty             /dev/tty        5       0 system:/dev/tty 
   unknown              /dev/tty        4    1-63 console 
 
+To see which character device lines are currently used for the system console
+/dev/console, you may simply look into the file /proc/tty/consoles:
+
+  > cat /proc/tty/consoles
+  tty0                 -WU (ECp)       4:7
+  ttyS0                -W- (Ep)        4:64
+
+The columns are:
+
+  device               name of the device
+  operations           R = can do read operations
+                       W = can do write operations
+                       U = can do unblank
+  flags                E = it is enabled
+                       C = it is prefered console
+                       B = it is primary boot console
+                       p = it is used for printk buffer
+                       b = it is not a TTY but a Braille device
+                       a = it is safe to use when cpu is offline
+                       * = it is standard input of the reading process
+  major:minor          major and minor number of the device separated by a colon
+
+If the reading process holds /dev/console open at the regular standard input
+stream the active device will be marked by an asterisk:
+
+  > cat /proc/tty/consoles < /dev/console
+  tty0                 -WU (ECp*)      4:7
+  ttyS0                -W- (Ep)        4:64
+  > tty
+  /dev/pts/3
+
 
 1.8 Miscellaneous kernel statistics in /proc/stat
 -------------------------------------------------
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..dc44f94022f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -12,7 +12,10 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/console.h>
 #include <linux/seq_file.h>
+#include <linux/fdtable.h>
 #include <linux/bitops.h>
 
 /*
@@ -136,6 +139,160 @@ static const struct file_operations proc_tty_drivers_operations = {
 	.release	= seq_release,
 };
 
+/*
+ * The device ID of file descriptor 0 of the current reading
+ * task if a character device...
+ */
+static dev_t current_dev;
+
+/*
+ * This is the handler for /proc/tty/consoles
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+	const struct tty_driver *driver;
+	struct console *con;
+	int index, len;
+	char flags[10];
+	dev_t dev;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+	con = (struct console *)v;
+	if (!con)
+		return 0;
+	driver = con->device(con, &index);
+	if (!driver)
+		return 0;
+	dev = MKDEV(driver->major, driver->minor_start) + index;
+
+	index = 0;
+	if (con->flags & CON_ENABLED)
+		flags[index++] = 'E';
+	if (con->flags & CON_CONSDEV)
+		flags[index++] = 'C';
+	if (con->flags & CON_BOOT)
+		flags[index++] = 'B';
+	if (con->flags & CON_PRINTBUFFER)
+		flags[index++] = 'p';
+	if (con->flags & CON_BRL)
+		flags[index++] = 'b';
+	if (con->flags & CON_ANYTIME)
+		flags[index++] = 'a';
+	if (current_dev == dev)
+		flags[index++] = '*';
+	flags[index] = 0;
+
+	seq_printf(m, "%s%d%n", con->name, con->index, &len);
+	len = 21 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c", len, ' ');
+	seq_printf(m, "%c%c%c (%s)%n", con->read ? 'R' : '-',
+			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+			flags, &len);
+	len = 13 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c%4d:%d\n", len, ' ', MAJOR(dev), MINOR(dev));
+
+	return 0;
+}
+
+/* iterator for consoles */
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	struct console *con;
+	loff_t off = 0;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	acquire_console_sem();
+	for (con = console_drivers; con; con = con->next) {
+		if (!con->device)
+			continue;
+		if (++off == *pos)
+			break;
+	}
+	release_console_sem();
+
+	return con;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct console *con;
+
+	acquire_console_sem();
+	if (v == SEQ_START_TOKEN)
+		con = console_drivers;
+	else
+		con = ((struct console *)v)->next;
+	for (; con; con = con->next) {
+		if (!con->device)
+			continue;
+		++*pos;
+		break;
+	}
+	release_console_sem();
+
+	return con;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations tty_consoles_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= show_console_dev
+};
+
+/*
+ * Used for open /proc/tty/consoles. Before this detect
+ * the device ID of file descriptor 0 of the current
+ * reading task if a character device...
+ */
+static int tty_consoles_open(struct inode *inode, struct file *file)
+{
+	struct files_struct *curfiles;
+
+	current_dev = 0;
+	curfiles = get_files_struct(current);
+	if (curfiles) {
+		const struct file *curfp;
+		spin_lock(&curfiles->file_lock);
+		curfp = fcheck_files(curfiles, 0);
+		if (curfp && curfp->private_data) {
+			const struct inode *inode;
+			dget(curfp->f_dentry);
+			inode = curfp->f_dentry->d_inode;
+			if (S_ISCHR(inode->i_mode)) {
+				struct tty_struct *tty;
+				tty = (struct tty_struct *)curfp->private_data;
+				if (tty && tty->magic == TTY_MAGIC) {
+					tty = tty_pair_get_tty(tty);
+					current_dev = tty_devnum(tty);
+				}
+			}
+			dput(curfp->f_dentry);
+		}
+		spin_unlock(&curfiles->file_lock);
+		put_files_struct(curfiles);
+	}
+	return seq_open(file, &tty_consoles_op);
+}
+
+static const struct file_operations proc_tty_consoles_operations = {
+	.open		= tty_consoles_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 /*
  * This function is called by tty_register_driver() to handle
  * registering the driver's /proc handler into /proc/tty/driver/<foo>
@@ -186,4 +343,5 @@ void __init proc_tty_init(void)
 	proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
 	proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
 	proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
+	proc_create("tty/consoles", 0, NULL, &proc_tty_consoles_operations);
 }
-- 
cgit v1.2.3


From a1f765061e1491d5ec467429d0d6adfd9df2f6d9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 16 Sep 2010 14:29:55 -0400
Subject: Btrfs: stop trying to shrink delalloc if there are no inodes to
 reclaim

In very severe ENOSPC cases we can run out of inodes to do delalloc on, which
means we'll just keep looping trying to shrink delalloc.  Instead, if we fail to
shrink delalloc 3 times in a row break out since we're not likely to make any
progress.  Tested this with a 100mb fs an xfstests test 13.  Before the patch it
would hang the box, with the patch we get -ENOSPC like we should.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..c6a5d9095d5f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3115,6 +3115,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
+	int no_reclaim = 0;
 	int pause = 1;
 	int ret;
 
@@ -3131,12 +3132,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	while (1) {
 		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
 		if (!ret) {
+			if (no_reclaim > 2)
+				break;
+			no_reclaim++;
 			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(pause);
 			pause <<= 1;
 			if (pause > HZ / 10)
 				pause = HZ / 10;
 		} else {
+			no_reclaim = 0;
 			pause = 1;
 		}
 
-- 
cgit v1.2.3


From bf5fc093c5b625e4259203f1cee7ca73488a5620 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 29 Sep 2010 11:22:36 -0400
Subject: Btrfs: fix the df ioctl to report raid types

The new ENOSPC stuff broke the df ioctl since we no longer create seperate space
info's for each RAID type.  So instead, loop through each space info's raid
lists so we can get the right RAID information which will allow the df ioctl to
tell us RAID types again.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ioctl.c | 100 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58dbe..db0b8fc59235 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1879,6 +1879,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
+static void get_block_group_info(struct list_head *groups_list,
+				 struct btrfs_ioctl_space_info *space)
+{
+	struct btrfs_block_group_cache *block_group;
+
+	space->total_bytes = 0;
+	space->used_bytes = 0;
+	space->flags = 0;
+	list_for_each_entry(block_group, groups_list, list) {
+		space->flags = block_group->flags;
+		space->total_bytes += block_group->key.offset;
+		space->used_bytes +=
+			btrfs_block_group_used(&block_group->item);
+	}
+}
+
 long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +1903,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_space_info *dest_orig;
 	struct btrfs_ioctl_space_info *user_dest;
 	struct btrfs_space_info *info;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
 	int alloc_size;
 	int ret = 0;
 	int slot_count = 0;
+	int i, c;
 
 	if (copy_from_user(&space_args,
 			   (struct btrfs_ioctl_space_args __user *)arg,
 			   sizeof(space_args)))
 		return -EFAULT;
 
-	/* first we count slots */
-	rcu_read_lock();
-	list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
-		slot_count++;
-	rcu_read_unlock();
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!info)
+			continue;
+
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c]))
+				slot_count++;
+		}
+		up_read(&info->groups_sem);
+	}
 
 	/* space_slots == 0 means they are asking for a count */
 	if (space_args.space_slots == 0) {
 		space_args.total_spaces = slot_count;
 		goto out;
 	}
+
+	slot_count = min_t(int, space_args.space_slots, slot_count);
+
 	alloc_size = sizeof(*dest) * slot_count;
+
 	/* we generally have at most 6 or so space infos, one for each raid
 	 * level.  So, a whole page should be more than enough for everyone
 	 */
@@ -1921,27 +1966,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	dest_orig = dest;
 
 	/* now we have a buffer to copy into */
-	rcu_read_lock();
-	list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
-		/* make sure we don't copy more than we allocated
-		 * in our buffer
-		 */
-		if (slot_count == 0)
-			break;
-		slot_count--;
-
-		/* make sure userland has enough room in their buffer */
-		if (space_args.total_spaces >= space_args.space_slots)
-			break;
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
 
-		space.flags = info->flags;
-		space.total_bytes = info->total_bytes;
-		space.used_bytes = info->bytes_used;
-		memcpy(dest, &space, sizeof(space));
-		dest++;
-		space_args.total_spaces++;
+		if (!info)
+			continue;
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c])) {
+				get_block_group_info(&info->block_groups[c],
+						     &space);
+				memcpy(dest, &space, sizeof(space));
+				dest++;
+				space_args.total_spaces++;
+			}
+		}
+		up_read(&info->groups_sem);
 	}
-	rcu_read_unlock();
 
 	user_dest = (struct btrfs_ioctl_space_info *)
 		(arg + sizeof(struct btrfs_ioctl_space_args));
-- 
cgit v1.2.3


From 89a55897a2fbbceb94480952784004bf23911d38 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 14 Oct 2010 14:52:27 -0400
Subject: Btrfs: fix df regression

The new ENOSPC stuff breaks out the raid types which breaks the way we were
reporting df to the system.  This fixes it back so that Available is the total
space available to data and used is the actual bytes used by the filesystem.
This means that Available is Total - data used - all of the metadata space.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  5 ++++-
 fs/btrfs/extent-tree.c | 10 ++++++++++
 fs/btrfs/super.c       | 11 +++++++++--
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 29c20092847e..014fd52c01bf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,7 +675,8 @@ struct btrfs_block_group_item {
 struct btrfs_space_info {
 	u64 flags;
 
-	u64 total_bytes;	/* total bytes in the space */
+	u64 total_bytes;	/* total bytes in the space,
+				   this doesn't take mirrors into account */
 	u64 bytes_used;		/* total bytes used,
 				   this does't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
@@ -687,6 +688,8 @@ struct btrfs_space_info {
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
 	u64 disk_used;		/* total bytes used on disk */
+	u64 disk_total;		/* total bytes on disk, takes mirrors into
+				   account */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a5d9095d5f..4669c6f8a44d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2763,6 +2763,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
+		found->disk_total += total_bytes * factor;
 		found->bytes_used += bytes_used;
 		found->disk_used += bytes_used * factor;
 		found->full = 0;
@@ -2782,6 +2783,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 				BTRFS_BLOCK_GROUP_SYSTEM |
 				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
+	found->disk_total = total_bytes * factor;
 	found->bytes_used = bytes_used;
 	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
@@ -8095,6 +8097,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_free_cluster *cluster;
 	struct btrfs_key key;
 	int ret;
+	int factor;
 
 	root = root->fs_info->extent_root;
 
@@ -8103,6 +8106,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!block_group->ro);
 
 	memcpy(&key, &block_group->key, sizeof(key));
+	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+				  BTRFS_BLOCK_GROUP_RAID1 |
+				  BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
 
 	/* make sure this block group isn't part of an allocation cluster */
 	cluster = &root->fs_info->data_alloc_cluster;
@@ -8143,6 +8152,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&block_group->space_info->lock);
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	block_group->space_info->disk_total -= block_group->key.offset * factor;
 	spin_unlock(&block_group->space_info->lock);
 
 	btrfs_clear_space_info_full(root->fs_info);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2393b390318..afab6ca14d03 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -716,18 +716,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct list_head *head = &root->fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
+	u64 total_used_data = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)root->fs_info->fsid;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list)
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+				    BTRFS_BLOCK_GROUP_SYSTEM))
+			total_used_data += found->disk_total;
+		else
+			total_used_data += found->disk_used;
 		total_used += found->disk_used;
+	}
 	rcu_read_unlock();
 
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
 	buf->f_bfree = buf->f_blocks - (total_used >> bits);
-	buf->f_bavail = buf->f_bfree;
+	buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
 
-- 
cgit v1.2.3


From 6d48755d02b150de7f47e7b4753202f2fc9f990f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 15 Oct 2010 15:13:32 -0400
Subject: Btrfs: fix reservation code for mixed block groups

The global reservation stuff tries to add together DATA and METADATA used in
order to figure out how much to reserve for everything, but this doesn't work
right for mixed block groups.  Instead if we have mixed block groups just set
data used to 0.  Also with mixed block groups we will use bytes_may_use for
keeping track of delalloc bytes, so we need to take that into account in our
reservation calculations.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4669c6f8a44d..0f27f7b48804 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3213,7 +3213,8 @@ static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
 
 	spin_lock(&space_info->lock);
 	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly;
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
 	if (unused < space_info->total_bytes)
 		unused = space_info->total_bytes - unused;
@@ -3507,6 +3508,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	spin_lock(&sinfo->lock);
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+		data_used = 0;
 	meta_used = sinfo->bytes_used;
 	spin_unlock(&sinfo->lock);
 
@@ -3534,7 +3537,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	block_rsv->size = num_bytes;
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-		    sinfo->bytes_reserved + sinfo->bytes_readonly;
+		    sinfo->bytes_reserved + sinfo->bytes_readonly +
+		    sinfo->bytes_may_use;
 
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
-- 
cgit v1.2.3


From 0019f10db6f596f3e14a19f9bd7059a1b85b0853 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 15 Oct 2010 15:18:40 -0400
Subject: Btrfs: re-work delalloc flushing

Currently we try and flush delalloc, but we only do that in a sort of weak way,
which works fine in most cases but if we're under heavy pressure we need to be
able to wait for flushing to happen.  Also instead of checking the bytes
reserved in the block_rsv, check the space info since it is more accurate.  The
sync option will be used in a future patch.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/extent-tree.c | 26 ++++++++++++++------------
 fs/btrfs/inode.c       | 24 ++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 014fd52c01bf..f32404db2c5d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2376,7 +2376,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0f27f7b48804..2846cebc9427 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3111,9 +3111,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim)
+			   struct btrfs_root *root, u64 to_reclaim, int sync)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_space_info *space_info;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
@@ -3122,9 +3123,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	int ret;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
-	spin_lock(&block_rsv->lock);
-	reserved = block_rsv->reserved;
-	spin_unlock(&block_rsv->lock);
+	space_info = block_rsv->space_info;
+	spin_lock(&space_info->lock);
+	reserved = space_info->bytes_reserved;
+	spin_unlock(&space_info->lock);
 
 	if (reserved == 0)
 		return 0;
@@ -3132,7 +3134,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (1) {
-		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync);
 		if (!ret) {
 			if (no_reclaim > 2)
 				break;
@@ -3147,11 +3149,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 			pause = 1;
 		}
 
-		spin_lock(&block_rsv->lock);
-		if (reserved > block_rsv->reserved)
-			reclaimed = reserved - block_rsv->reserved;
-		reserved = block_rsv->reserved;
-		spin_unlock(&block_rsv->lock);
+		spin_lock(&space_info->lock);
+		if (reserved > space_info->bytes_reserved)
+			reclaimed += reserved - space_info->bytes_reserved;
+		reserved = space_info->bytes_reserved;
+		spin_unlock(&space_info->lock);
 
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
@@ -3180,7 +3182,7 @@ static int should_retry_reserve(struct btrfs_trans_handle *trans,
 	if (trans && trans->transaction->in_commit)
 		return -ENOSPC;
 
-	ret = shrink_delalloc(trans, root, num_bytes);
+	ret = shrink_delalloc(trans, root, num_bytes, 0);
 	if (ret)
 		return ret;
 
@@ -3729,7 +3731,7 @@ again:
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	if (block_rsv->size > 512 * 1024 * 1024)
-		shrink_delalloc(NULL, root, to_reserve);
+		shrink_delalloc(NULL, root, to_reserve, 0);
 
 	return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad4744..5f9e4fc20a73 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6603,7 +6603,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode = NULL;
@@ -6625,7 +6626,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
 	spin_unlock(&root->fs_info->delalloc_lock);
 
 	if (inode) {
-		write_inode_now(inode, 0);
+		if (sync) {
+			filemap_write_and_wait(inode->i_mapping);
+			/*
+			 * We have to do this because compression doesn't
+			 * actually set PG_writeback until it submits the pages
+			 * for IO, which happens in an async thread, so we could
+			 * race and not actually wait for any writeback pages
+			 * because they've not been submitted yet.  Technically
+			 * this could still be the case for the ordered stuff
+			 * since the async thread may not have started to do its
+			 * work yet.  If this becomes the case then we need to
+			 * figure out a way to make sure that in writepage we
+			 * wait for any async pages to be submitted before
+			 * returning so that fdatawait does what its supposed to
+			 * do.
+			 */
+			btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		} else {
+			filemap_flush(inode->i_mapping);
+		}
 		if (delay_iput)
 			btrfs_add_delayed_iput(inode);
 		else
-- 
cgit v1.2.3


From 14ed0ca6e8236f2d264c4a8faec9e3a2b3d04377 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 15 Oct 2010 15:23:48 -0400
Subject: Btrfs: don't allocate chunks as aggressively

Because the ENOSPC code over reserves super aggressively we end up allocating
chunks way more often than we should.  For example with my fs_mark tests on a
2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is
being used.  So instead check to see if the amount of space actually used is
less than 30% of the total space, and if so don't allocate a chunk, but only if
we have at least 256mb of free space to make sure we don't put too much pressure
on free space.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2846cebc9427..aca3314ef8b9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3000,8 +3000,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
-			      u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 
@@ -3013,6 +3012,10 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
 	    alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
+	if (num_bytes > 256 * 1024 * 1024 &&
+	    sinfo->bytes_used < div_factor(num_bytes, 3))
+		return 0;
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 8bb8ab2e93f9c3c9453e13be0f37d344a32a3a6d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 15 Oct 2010 16:52:49 -0400
Subject: Btrfs: rework how we reserve metadata bytes

With multi-threaded writes we were getting ENOSPC early because somebody would
come in, start flushing delalloc because they couldn't make their reservation,
and in the meantime other threads would come in and use the space that was
getting freed up, so when the original thread went to check to see if they had
space they didn't and they'd return ENOSPC.  So instead if we have some free
space but not enough for our reservation, take the reservation and then start
doing the flushing.  The only time we don't take reservations is when we've
already overcommitted our space, that way we don't have people who come late to
the party way overcommitting ourselves.  This also moves all of the retrying and
flushing code into reserve_metdata_bytes so it's all uniform.  This keeps my
fs_mark test from returning -ENOSPC as soon as it starts and actually lets me
fill up the disk.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |   4 +-
 fs/btrfs/extent-tree.c | 238 +++++++++++++++++++++++++++----------------------
 fs/btrfs/relocation.c  |  14 +--
 fs/btrfs/transaction.c |   7 +-
 4 files changed, 136 insertions(+), 127 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f32404db2c5d..47bc66e34da7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2082,7 +2082,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				int num_items, int *retries);
+				int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2103,7 +2103,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries);
+			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aca3314ef8b9..180a50146ddf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3078,38 +3078,6 @@ out:
 	return ret;
 }
 
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_space_info *sinfo, u64 num_bytes)
-{
-	int ret;
-	int end_trans = 0;
-
-	if (sinfo->full)
-		return 0;
-
-	spin_lock(&sinfo->lock);
-	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-	spin_unlock(&sinfo->lock);
-	if (!ret)
-		return 0;
-
-	if (!trans) {
-		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(IS_ERR(trans));
-		end_trans = 1;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     num_bytes + 2 * 1024 * 1024,
-			     get_alloc_profile(root, sinfo->flags), 0);
-
-	if (end_trans)
-		btrfs_end_transaction(trans, root);
-
-	return ret == 1 ? 1 : 0;
-}
-
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3167,79 +3135,138 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	return reclaimed >= to_reclaim;
 }
 
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes, int *retries)
+/*
+ * Retries tells us how many times we've called reserve_metadata_bytes.  The
+ * idea is if this is the first call (retries == 0) then we will add to our
+ * reserved count if we can't make the allocation in order to hold our place
+ * while we go and try and free up space.  That way for retries > 1 we don't try
+ * and add space, we just check to see if the amount of unused space is >= the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don't intend to retry this reservation, pass -1 as retries so
+ * that it short circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	int ret;
+	u64 unused;
+	u64 num_bytes = orig_bytes;
+	int retries = 0;
+	int ret = 0;
+	bool reserved = false;
 
-	if ((*retries) > 2)
-		return -ENOSPC;
+again:
+	ret = -ENOSPC;
+	if (reserved)
+		num_bytes = 0;
 
-	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-	if (ret)
-		return 1;
+	spin_lock(&space_info->lock);
+	unused = space_info->bytes_used + space_info->bytes_reserved +
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
-	if (trans && trans->transaction->in_commit)
-		return -ENOSPC;
+	/*
+	 * The idea here is that we've not already over-reserved the block group
+	 * then we can go ahead and save our reservation first and then start
+	 * flushing if we need to.  Otherwise if we've already overcommitted
+	 * lets start flushing stuff first and then come back and try to make
+	 * our reservation.
+	 */
+	if (unused <= space_info->total_bytes) {
+		unused -= space_info->total_bytes;
+		if (unused >= num_bytes) {
+			if (!reserved)
+				space_info->bytes_reserved += orig_bytes;
+			ret = 0;
+		} else {
+			/*
+			 * Ok set num_bytes to orig_bytes since we aren't
+			 * overocmmitted, this way we only try and reclaim what
+			 * we need.
+			 */
+			num_bytes = orig_bytes;
+		}
+	} else {
+		/*
+		 * Ok we're over committed, set num_bytes to the overcommitted
+		 * amount plus the amount of bytes that we need for this
+		 * reservation.
+		 */
+		num_bytes = unused - space_info->total_bytes +
+			(orig_bytes * (retries + 1));
+	}
 
-	ret = shrink_delalloc(trans, root, num_bytes, 0);
-	if (ret)
-		return ret;
+	/*
+	 * Couldn't make our reservation, save our place so while we're trying
+	 * to reclaim space we can actually use it instead of somebody else
+	 * stealing it from us.
+	 */
+	if (ret && !reserved) {
+		space_info->bytes_reserved += orig_bytes;
+		reserved = true;
+	}
 
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < num_bytes)
-		ret = 1;
 	spin_unlock(&space_info->lock);
-	if (ret)
-		return -ENOSPC;
-
-	(*retries)++;
 
-	if (trans)
-		return -EAGAIN;
+	if (!ret)
+		return 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(IS_ERR(trans));
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	if (!flush)
+		goto out;
 
-	return 1;
-}
+	/*
+	 * We do synchronous shrinking since we don't actually unreserve
+	 * metadata until after the IO is completed.
+	 */
+	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	if (ret > 0)
+		return 0;
+	else if (ret < 0)
+		goto out;
 
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-				  u64 num_bytes)
-{
-	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
-	int ret = -ENOSPC;
+	/*
+	 * So if we were overcommitted it's possible that somebody else flushed
+	 * out enough space and we simply didn't have enough space to reclaim,
+	 * so go back around and try again.
+	 */
+	if (retries < 2) {
+		retries++;
+		goto again;
+	}
 
 	spin_lock(&space_info->lock);
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	/*
+	 * Not enough space to be reclaimed, don't bother committing the
+	 * transaction.
+	 */
+	if (space_info->bytes_pinned < orig_bytes)
+		ret = -ENOSPC;
+	spin_unlock(&space_info->lock);
+	if (ret)
+		goto out;
 
-	if (unused < space_info->total_bytes)
-		unused = space_info->total_bytes - unused;
-	else
-		unused = 0;
+	ret = -EAGAIN;
+	if (trans)
+		goto out;
 
-	if (unused >= num_bytes) {
-		if (block_rsv->priority >= 10) {
-			space_info->bytes_reserved += num_bytes;
-			ret = 0;
-		} else {
-			if ((unused + block_rsv->reserved) *
-			    block_rsv->priority >=
-			    (num_bytes + block_rsv->reserved) * 10) {
-				space_info->bytes_reserved += num_bytes;
-				ret = 0;
-			}
-		}
+
+	ret = -ENOSPC;
+	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		goto out;
+	ret = btrfs_commit_transaction(trans, root);
+	if (!ret)
+		goto again;
+
+out:
+	if (reserved) {
+		spin_lock(&space_info->lock);
+		space_info->bytes_reserved -= orig_bytes;
+		spin_unlock(&space_info->lock);
 	}
-	spin_unlock(&space_info->lock);
 
 	return ret;
 }
@@ -3383,23 +3410,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries)
+			u64 num_bytes)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
-again:
-	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
 	}
 
-	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-	if (ret > 0)
-		goto again;
-
 	return ret;
 }
 
@@ -3434,7 +3457,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		return 0;
 
 	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     num_bytes, 0);
 		if (!ret) {
 			block_rsv_add_bytes(block_rsv, num_bytes, 0);
 			return 0;
@@ -3614,7 +3638,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 int num_items, int *retries)
+				 int num_items)
 {
 	u64 num_bytes;
 	int ret;
@@ -3624,7 +3648,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 
 	num_bytes = calc_trans_metadata_size(root, num_items);
 	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-				  num_bytes, retries);
+				  num_bytes);
 	if (!ret) {
 		trans->bytes_reserved += num_bytes;
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3698,14 +3722,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve;
 	int nr_extents;
-	int retries = 0;
 	int ret;
 
 	if (btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
 	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3715,18 +3738,14 @@ again:
 		nr_extents = 0;
 		to_reserve = 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(block_rsv, to_reserve);
-	if (ret) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-					   &retries);
-		if (ret > 0)
-			goto again;
+	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	if (ret)
 		return ret;
-	}
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -5325,7 +5344,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(block_rsv, blocksize);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     blocksize, 0);
 		if (ret)
 			return ERR_PTR(ret);
 		return block_rsv;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..39adb68a653f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -178,8 +178,6 @@ struct reloc_control {
 	u64 search_start;
 	u64 extents_found;
 
-	int block_rsv_retries;
-
 	unsigned int stage:8;
 	unsigned int create_reloc_tree:1;
 	unsigned int merge_reloc_tree:1;
@@ -2133,7 +2131,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 	LIST_HEAD(reloc_roots);
 	u64 num_bytes = 0;
 	int ret;
-	int retries = 0;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2140,7 @@ again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
 		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-					  num_bytes, &retries);
+					  num_bytes);
 		if (ret)
 			err = ret;
 	}
@@ -2155,7 +2152,6 @@ again:
 			btrfs_end_transaction(trans, rc->extent_root);
 			btrfs_block_rsv_release(rc->extent_root,
 						rc->block_rsv, num_bytes);
-			retries = 0;
 			goto again;
 		}
 	}
@@ -2405,15 +2401,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
-				  &rc->block_rsv_retries);
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
 		return ret;
 	}
 
-	rc->block_rsv_retries = 0;
 	return 0;
 }
 
@@ -3554,8 +3548,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	 * is no reservation in transaction handle.
 	 */
 	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
-				  rc->extent_root->nodesize * 256,
-				  &rc->block_rsv_retries);
+				  rc->extent_root->nodesize * 256);
 	if (ret)
 		return ret;
 
@@ -3567,7 +3560,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->extents_found = 0;
 	rc->nodes_relocated = 0;
 	rc->merging_rsv_size = 0;
-	rc->block_rsv_retries = 0;
 
 	rc->create_reloc_tree = 1;
 	set_reloc_control(rc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..abbec80aaa44 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -179,7 +179,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
 	int ret;
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -212,8 +211,7 @@ again:
 	}
 
 	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items,
-						   &retries);
+		ret = btrfs_trans_reserve_metadata(h, root, num_items);
 		if (ret == -EAGAIN) {
 			btrfs_commit_transaction(h, root);
 			goto again;
@@ -836,7 +834,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
-	int retries = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
@@ -858,7 +855,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	if (to_reserve > 0) {
 		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
-					  to_reserve, &retries);
+					  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
-- 
cgit v1.2.3


From 0e78340f3c1fc603e8016c8ac304766bcc65506e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 22 Oct 2010 15:26:53 -0400
Subject: Btrfs: fix error handling in btrfs_get_sb

If we failed to find the root subvol id, or the subvol=<name>, we would
deactivate the locked super and close the devices.  The problem is at this point
we have gotten the SB all setup, which includes setting super_operations, so
when we'd deactiveate the super, we'd do a close_ctree() which closes the
devices, so we'd end up closing the devices twice.  So if you do something like
this

mount /dev/sda1 /mnt/test1
mount /dev/sda1 /mnt/test2 -o subvol=xxx
umount /mnt/test1

it would blow up (if subvol xxx doesn't exist).  This patch fixes that problem.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index afab6ca14d03..d1867cda92a7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -629,7 +629,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(root)) {
 		error = PTR_ERR(root);
 		deactivate_locked_super(s);
-		goto error;
+		goto error_free_subvol_name;
 	}
 	/* if they gave us a subvolume name bind mount into that */
 	if (strcmp(subvol_name, ".")) {
@@ -643,14 +643,14 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			deactivate_locked_super(s);
 			error = PTR_ERR(new_root);
 			dput(root);
-			goto error_close_devices;
+			goto error_free_subvol_name;
 		}
 		if (!new_root->d_inode) {
 			dput(root);
 			dput(new_root);
 			deactivate_locked_super(s);
 			error = -ENXIO;
-			goto error_close_devices;
+			goto error_free_subvol_name;
 		}
 		dput(root);
 		root = new_root;
@@ -668,7 +668,6 @@ error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
-error:
 	return error;
 }
 
-- 
cgit v1.2.3


From 9566a7a851eb7201e3207eab53ee81efd0850fee Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Tue, 10 Aug 2010 00:58:41 +0900
Subject: nilfs2: accept future revisions

Compatibility of nilfs partitions is now managed with three feature
sets.  This changes old compatibility check with revision number so
that it can accept future revisions.

Note that we can stop support of experimental versions of nilfs that
doesn't know the feature sets by incrementing NILFS_CURRENT_REV.  We
don't have to do it soon, but it would be a possible option whenever
the need arises.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c     | 4 ++--
 include/linux/nilfs2_fs.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..461b7211e14f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -468,8 +468,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
 static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 				   struct nilfs_super_block *sbp)
 {
-	if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
-		printk(KERN_ERR "NILFS: revision mismatch "
+	if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
+		printk(KERN_ERR "NILFS: unsupported revision "
 		       "(superblock rev.=%d.%d, current rev.=%d.%d). "
 		       "Please check the version of mkfs.nilfs.\n",
 		       le32_to_cpu(sbp->s_rev_level),
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index f5487b6f91ed..b07f5cdff5e2 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -229,6 +229,7 @@ struct nilfs_super_block {
  */
 #define NILFS_CURRENT_REV	2	/* current major revision */
 #define NILFS_MINOR_REV		0	/* minor revision */
+#define NILFS_MIN_SUPP_REV	2	/* minimum supported revision */
 
 /*
  * Feature set definitions
-- 
cgit v1.2.3


From b91c9a97c9333c87fe2a0c94b3b22b24df1c5fc2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 23:46:06 +0900
Subject: nilfs2: allow nilfs_destroy_inode to destroy metadata file inodes

The current nilfs_destroy_inode() doesn't handle metadata file inodes
including gc inodes (dummy inodes used for garbage collection).

This allows nilfs_destroy_inode() to destroy inodes of metadata files.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c   | 2 --
 fs/nilfs2/super.c | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..ee943a342e1d 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -577,7 +577,5 @@ void nilfs_mdt_destroy(struct inode *inode)
 		nilfs_palloc_destroy_cache(inode);
 	nilfs_mdt_clear(inode);
 
-	kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
-	kfree(mdi);
 	nilfs_destroy_inode(inode);
 }
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 9f4913f78408..51576b4dbf7a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -167,6 +167,12 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 
 void nilfs_destroy_inode(struct inode *inode)
 {
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	if (mdi) {
+		kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+		kfree(mdi);
+	}
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-- 
cgit v1.2.3


From 7d6cd92fe2de3c286b8e95f43223f09db9302635 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 21 Aug 2010 00:30:39 +0900
Subject: nilfs2: allow nilfs_dirty_inode to mark metadata file inodes dirty

This allows sop->dirty_inode callback function (nilfs_dirty_inode) to
handle metadata file inodes.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..f1750caa362c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -808,6 +808,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 void nilfs_dirty_inode(struct inode *inode)
 {
 	struct nilfs_transaction_info ti;
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
 	if (is_bad_inode(inode)) {
 		nilfs_warning(inode->i_sb, __func__,
@@ -815,6 +816,10 @@ void nilfs_dirty_inode(struct inode *inode)
 		dump_stack();
 		return;
 	}
+	if (mdi) {
+		nilfs_mdt_mark_dirty(inode);
+		return;
+	}
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	nilfs_mark_inode_dirty(inode);
 	nilfs_transaction_commit(inode->i_sb); /* never fails */
-- 
cgit v1.2.3


From 6c43f41000312fefa482c3bfdd97e7f81d6be0ec Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 20:10:38 +0900
Subject: nilfs2: keep zero value in i_cno except for gc-inodes

On-memory inode structures of nilfs have a member "i_cno" which stores
a checkpoint number related to the inode.  For gc-inodes, this field
indicates version of data each gc-inode caches for GC.  Log writer
temporarily uses "i_cno" to transfer the latest checkpoint number.

This stops the latter use and lets only gc-inodes use it.

The purpose of this patch is to allow the successive change use
"i_cno" for inode lookup.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c       | 29 ++++++++++++++---------------
 fs/nilfs2/segment.h       |  3 ++-
 include/linux/nilfs2_fs.h |  8 ++++++++
 3 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..eee4b223c293 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -366,8 +366,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
-				 sci->sc_sbi->s_nilfs->ns_cno);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
 	if (unlikely(err))
 		return err;
 
@@ -440,17 +439,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
 	struct nilfs_finfo *finfo;
 	struct nilfs_inode_info *ii;
 	struct nilfs_segment_buffer *segbuf;
+	__u64 cno;
 
 	if (sci->sc_blk_cnt == 0)
 		return;
 
 	ii = NILFS_I(inode);
+
+	if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+		cno = ii->i_cno;
+	else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
+		cno = 0;
+	else
+		cno = sci->sc_cno;
+
 	finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
 						 sizeof(*finfo));
 	finfo->fi_ino = cpu_to_le64(inode->i_ino);
 	finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
 	finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
-	finfo->fi_cno = cpu_to_le64(ii->i_cno);
+	finfo->fi_cno = cpu_to_le64(cno);
 
 	segbuf = sci->sc_curseg;
 	segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -1976,7 +1984,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 					struct nilfs_sb_info *sbi)
 {
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
 
 	spin_lock(&sbi->s_inode_lock);
  retry:
@@ -2002,7 +2009,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 				brelse(ibh);
 			goto retry;
 		}
-		ii->i_cno = cno;
 
 		clear_bit(NILFS_I_QUEUED, &ii->i_state);
 		set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2017,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 	}
 	spin_unlock(&sbi->s_inode_lock);
 
-	NILFS_I(sbi->s_ifile)->i_cno = cno;
-
 	return 0;
 }
 
@@ -2021,19 +2025,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
 
 	spin_lock(&sbi->s_inode_lock);
 	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
 		if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
-		    test_bit(NILFS_I_DIRTY, &ii->i_state)) {
-			/* The current checkpoint number (=nilfs->ns_cno) is
-			   changed between check-in and check-out only if the
-			   super root is written out.  So, we can update i_cno
-			   for the inodes that remain in the dirty list. */
-			ii->i_cno = cno;
+		    test_bit(NILFS_I_DIRTY, &ii->i_state))
 			continue;
-		}
+
 		clear_bit(NILFS_I_BUSY, &ii->i_state);
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
@@ -2054,6 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
+	sci->sc_cno = nilfs->ns_cno;
 
 	err = nilfs_segctor_check_in_files(sci, sbi);
 	if (unlikely(err))
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..675d932148a4 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -107,6 +107,7 @@ struct nilfs_segsum_pointer {
  * @sc_datablk_cnt: Data block count of a file
  * @sc_nblk_this_inc: Number of blocks included in the current logical segment
  * @sc_seg_ctime: Creation time
+ * @sc_cno: checkpoint number of current log
  * @sc_flags: Internal flags
  * @sc_state_lock: spinlock for sc_state and so on
  * @sc_state: Segctord state flags
@@ -156,7 +157,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_datablk_cnt;
 	unsigned long		sc_nblk_this_inc;
 	time_t			sc_seg_ctime;
-
+	__u64			sc_cno;
 	unsigned long		sc_flags;
 
 	spinlock_t		sc_state_lock;
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index b07f5cdff5e2..bcdb34c68d08 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -270,6 +270,14 @@ struct nilfs_super_block {
 #define NILFS_MIN_NRSVSEGS	8	/* Minimum number of reserved
 					   segments */
 
+/*
+ * We call DAT, cpfile, and sufile root metadata files.  Inodes of
+ * these files are written in super root block instead of ifile, and
+ * garbage collector doesn't keep any past versions of these files.
+ */
+#define NILFS_ROOT_METADATA_FILE(ino) \
+	((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO)
+
 /*
  * bytes offset of secondary super block
  */
-- 
cgit v1.2.3


From 0e14a3595bddedfb27b51a6b0a29b5173aa2511a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 21:20:29 +0900
Subject: nilfs2: use iget5_locked to get inode

This uses iget5_locked instead of iget_locked so that gc cache can
look up inodes with an inode number and an optional checkpoint number.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 37 ++++++++++++++++++++++++++++++++++---
 fs/nilfs2/super.c |  1 +
 2 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index f1750caa362c..6e9df85b5824 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,11 @@
 #include "cpfile.h"
 #include "ifile.h"
 
+struct nilfs_iget_args {
+	u64 ino;
+	__u64 cno;
+	int for_gc;
+};
 
 /**
  * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -320,7 +325,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
 	ii->i_dir_start_lookup = 0;
-	ii->i_cno = 0;
 	nilfs_set_inode_flags(inode);
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -410,7 +414,6 @@ int nilfs_read_inode_common(struct inode *inode,
 		0 : le32_to_cpu(raw_inode->i_dir_acl);
 #endif
 	ii->i_dir_start_lookup = 0;
-	ii->i_cno = 0;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 
 	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -476,12 +479,40 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 	return err;
 }
 
+static int nilfs_iget_test(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+	struct nilfs_inode_info *ii;
+
+	if (args->ino != inode->i_ino)
+		return 0;
+
+	ii = NILFS_I(inode);
+	if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
+		return !args->for_gc;
+
+	return args->for_gc && args->cno == ii->i_cno;
+}
+
+static int nilfs_iget_set(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+
+	inode->i_ino = args->ino;
+	if (args->for_gc) {
+		NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+		NILFS_I(inode)->i_cno = args->cno;
+	}
+	return 0;
+}
+
 struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
 {
+	struct nilfs_iget_args args = { .ino = ino, .cno = 0, .for_gc = 0 };
 	struct inode *inode;
 	int err;
 
-	inode = iget_locked(sb, ino);
+	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
 	if (unlikely(!inode))
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 51576b4dbf7a..f3a00a3b2a03 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -155,6 +155,7 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 		return NULL;
 	ii->i_bh = NULL;
 	ii->i_state = 0;
+	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
 	nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
 	return &ii->vfs_inode;
-- 
cgit v1.2.3


From 5e19a995f4ad8a8f20749a396bb01ebb6d4df96c Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 21 Aug 2010 22:01:51 +0900
Subject: nilfs2: separate initializer of metadata file inode

This separates a part of initialization code of metadata file inode,
and makes it available from the nilfs iget function that a later patch
will add to.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/gcinode.c |  6 +++++-
 fs/nilfs2/mdt.c     | 44 +++++++++++++++++++++++++++-----------------
 fs/nilfs2/mdt.h     |  4 +++-
 3 files changed, 35 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..cd19a3709bda 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -225,10 +225,14 @@ static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
 	struct inode *inode;
 	struct nilfs_inode_info *ii;
 
-	inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
+	inode = nilfs_mdt_new_common(nilfs, NULL, ino);
 	if (!inode)
 		return NULL;
 
+	if (nilfs_mdt_init(inode, nilfs, GFP_NOFS, 0) < 0) {
+		nilfs_destroy_inode(inode);
+		return NULL;
+	}
 	inode->i_op = NULL;
 	inode->i_fop = NULL;
 	inode->i_mapping->a_ops = &def_gcinode_aops;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index ee943a342e1d..73e5da3b097e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -439,6 +439,27 @@ static const struct address_space_operations def_mdt_aops = {
 static const struct inode_operations def_mdt_iops;
 static const struct file_operations def_mdt_fops;
 
+
+int nilfs_mdt_init(struct inode *inode, struct the_nilfs *nilfs,
+		   gfp_t gfp_mask, size_t objsz)
+{
+	struct nilfs_mdt_info *mi;
+
+	mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
+	if (!mi)
+		return -ENOMEM;
+
+	mi->mi_nilfs = nilfs;
+	init_rwsem(&mi->mi_sem);
+	inode->i_private = mi;
+
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
+	inode->i_mapping->backing_dev_info = nilfs->ns_bdi;
+
+	return 0;
+}
+
 /*
  * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
  * ifile, or gcinodes.  This allows the B-tree code and segment constructor
@@ -454,12 +475,10 @@ static const struct file_operations def_mdt_fops;
  * @nilfs: nilfs object
  * @sb: super block instance the metadata file belongs to
  * @ino: inode number
- * @gfp_mask: gfp mask for data pages
- * @objsz: size of the private object attached to inode->i_private
  */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino, gfp_t gfp_mask, size_t objsz)
+		     ino_t ino)
 {
 	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
@@ -467,15 +486,6 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		return NULL;
 	else {
 		struct address_space * const mapping = &inode->i_data;
-		struct nilfs_mdt_info *mi;
-
-		mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
-		if (!mi) {
-			nilfs_destroy_inode(inode);
-			return NULL;
-		}
-		mi->mi_nilfs = nilfs;
-		init_rwsem(&mi->mi_sem);
 
 		inode->i_sb = sb; /* sb may be NULL for some meta data files */
 		inode->i_blkbits = nilfs->ns_blocksize_bits;
@@ -483,8 +493,6 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		atomic_set(&inode->i_count, 1);
 		inode->i_nlink = 1;
 		inode->i_ino = ino;
-		inode->i_mode = S_IFREG;
-		inode->i_private = mi;
 
 #ifdef INIT_UNUSED_INODE_FIELDS
 		atomic_set(&inode->i_writecount, 0);
@@ -515,9 +523,7 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 
 		mapping->host = NULL;  /* instead of inode */
 		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, gfp_mask);
 		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = nilfs->ns_bdi;
 
 		inode->i_mapping = mapping;
 	}
@@ -530,10 +536,14 @@ struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
 {
 	struct inode *inode;
 
-	inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
+	inode = nilfs_mdt_new_common(nilfs, sb, ino);
 	if (!inode)
 		return NULL;
 
+	if (nilfs_mdt_init(inode, nilfs, NILFS_MDT_GFP, objsz) < 0) {
+		nilfs_destroy_inode(inode);
+		return NULL;
+	}
 	inode->i_op = &def_mdt_iops;
 	inode->i_fop = &def_mdt_fops;
 	inode->i_mapping->a_ops = &def_mdt_aops;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..f44560224bd1 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -76,10 +76,12 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
+int nilfs_mdt_init(struct inode *inode, struct the_nilfs *nilfs,
+		   gfp_t gfp_mask, size_t objsz);
 struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
 			    size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t, gfp_t, size_t);
+				   ino_t);
 void nilfs_mdt_destroy(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);
-- 
cgit v1.2.3


From 263d90cefc7d82a01c296c59532ff59d67c63509 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 19:06:11 +0900
Subject: nilfs2: remove own inode hash used for GC

This uses inode hash function that vfs provides instead of the own
hash table for caching gc inodes.  This finally removes the own inode
hash from nilfs.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/gcinode.c   | 140 +++++++++-----------------------------------------
 fs/nilfs2/inode.c     |  22 ++++++++
 fs/nilfs2/ioctl.c     |  17 +++---
 fs/nilfs2/nilfs.h     |   9 ++--
 fs/nilfs2/segment.c   |   3 +-
 fs/nilfs2/the_nilfs.c |   8 +--
 fs/nilfs2/the_nilfs.h |   7 +--
 7 files changed, 64 insertions(+), 142 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index cd19a3709bda..34f8f84a22e3 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
- * has to treat blocks that belong to a same file but have different
- * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separately from actual inodes, and their lookup
- * function (nilfs_gc_iget) is designed to be specified with a
- * checkpoint number argument as well as an inode number.
- *
  * Buffers and pages held by the dummy inodes will be released each
  * time after they are copied to a new log.  Dirty blocks made on the
  * current generation and the blocks to be moved by GC never overlap
@@ -180,124 +173,41 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	return 0;
 }
 
-/*
- * nilfs_init_gccache() - allocate and initialize gc_inode hash table
- * @nilfs - the_nilfs
- *
- * Return Value: On success, 0.
- * On error, a negative error code is returned.
- */
-int nilfs_init_gccache(struct the_nilfs *nilfs)
-{
-	int loop;
-
-	BUG_ON(nilfs->ns_gc_inodes_h);
-
-	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
-
-	nilfs->ns_gc_inodes_h =
-		kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
-			GFP_NOFS);
-	if (nilfs->ns_gc_inodes_h == NULL)
-		return -ENOMEM;
-
-	for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
-		INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
-	return 0;
-}
-
-/*
- * nilfs_destroy_gccache() - free gc_inode hash table
- * @nilfs - the nilfs
- */
-void nilfs_destroy_gccache(struct the_nilfs *nilfs)
+int nilfs_init_gcinode(struct inode *inode)
 {
-	if (nilfs->ns_gc_inodes_h) {
-		nilfs_remove_all_gcinode(nilfs);
-		kfree(nilfs->ns_gc_inodes_h);
-		nilfs->ns_gc_inodes_h = NULL;
-	}
-}
-
-static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
-				   __u64 cno)
-{
-	struct inode *inode;
-	struct nilfs_inode_info *ii;
-
-	inode = nilfs_mdt_new_common(nilfs, NULL, ino);
-	if (!inode)
-		return NULL;
-
-	if (nilfs_mdt_init(inode, nilfs, GFP_NOFS, 0) < 0) {
-		nilfs_destroy_inode(inode);
-		return NULL;
-	}
-	inode->i_op = NULL;
-	inode->i_fop = NULL;
-	inode->i_mapping->a_ops = &def_gcinode_aops;
-
-	ii = NILFS_I(inode);
-	ii->i_cno = cno;
-	ii->i_flags = 0;
-	ii->i_state = 1 << NILFS_I_GCINODE;
-	ii->i_bh = NULL;
-	nilfs_bmap_init_gc(ii->i_bmap);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	int ret;
 
-	return inode;
-}
+	ret = nilfs_mdt_init(inode, nilfs, GFP_NOFS, 0);
+	if (!ret) {
+		inode->i_mapping->a_ops = &def_gcinode_aops;
 
-static unsigned long ihash(ino_t ino, __u64 cno)
-{
-	return hash_long((unsigned long)((ino << 2) + cno),
-			 NILFS_GCINODE_HASH_BITS);
-}
+		ii->i_flags = 0;
+		nilfs_bmap_init_gc(ii->i_bmap);
 
-/*
- * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
- */
-struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
-{
-	struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
-	struct hlist_node *node;
-	struct inode *inode;
-
-	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
-			return inode;
-	}
-
-	inode = alloc_gcinode(nilfs, ino, cno);
-	if (likely(inode)) {
-		hlist_add_head(&inode->i_hash, head);
+		/*
+		 * Add the inode to GC inode list. Garbage Collection
+		 * is serialized and no two processes manipulate the
+		 * list simultaneously.
+		 */
+		igrab(inode);
 		list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
 	}
-	return inode;
-}
-
-/*
- * nilfs_clear_gcinode() - clear and free a gc inode
- */
-void nilfs_clear_gcinode(struct inode *inode)
-{
-	nilfs_mdt_destroy(inode);
+	return ret;
 }
 
-/*
- * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
+/**
+ * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
  */
-void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
 {
-	struct hlist_head *head = nilfs->ns_gc_inodes_h;
-	struct hlist_node *node, *n;
-	struct inode *inode;
-	int loop;
+	struct list_head *head = &nilfs->ns_gc_inodes;
+	struct nilfs_inode_info *ii;
 
-	for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
-		hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
-			hlist_del_init(&inode->i_hash);
-			list_del_init(&NILFS_I(inode)->i_dirty);
-			nilfs_clear_gcinode(inode); /* might sleep */
-		}
+	while (!list_empty(head)) {
+		ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
+		list_del_init(&ii->i_dirty);
+		iput(&ii->vfs_inode);
 	}
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6e9df85b5824..82cfdbc43e1c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -527,6 +527,28 @@ struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
+struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
+				__u64 cno)
+{
+	struct nilfs_iget_args args = { .ino = ino, .cno = cno, .for_gc = 1 };
+	struct inode *inode;
+	int err;
+
+	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = nilfs_init_gcinode(inode);
+	if (unlikely(err)) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
 void nilfs_write_inode_common(struct inode *inode,
 			      struct nilfs_inode *raw_inode, int has_bmap)
 {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 0442ee3b394f..2ee6843c2e87 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -333,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
 	return 0;
 }
 
-static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
+static int nilfs_ioctl_move_blocks(struct super_block *sb,
 				   struct nilfs_argv *argv, void *buf)
 {
 	size_t nmembs = argv->v_nmembs;
@@ -348,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
 	for (i = 0, vdesc = buf; i < nmembs; ) {
 		ino = vdesc->vd_ino;
 		cno = vdesc->vd_cno;
-		inode = nilfs_gc_iget(nilfs, ino, cno);
+		inode = nilfs_iget_for_gc(sb, ino, cno);
 		if (unlikely(inode == NULL)) {
 			ret = -ENOMEM;
 			goto failed;
@@ -356,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
 		do {
 			ret = nilfs_ioctl_move_inode_block(inode, vdesc,
 							   &buffers);
-			if (unlikely(ret < 0))
+			if (unlikely(ret < 0)) {
+				iput(inode);
 				goto failed;
+			}
 			vdesc++;
 		} while (++i < nmembs &&
 			 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
+
+		iput(inode); /* The inode still remains in GC inode list */
 	}
 
 	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -566,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	}
 
 	/*
-	 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+	 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 	 * which will operates an inode list without blocking.
 	 * To protect the list from concurrent operations,
 	 * nilfs_ioctl_move_blocks should be atomic operation.
@@ -576,15 +580,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		goto out_free;
 	}
 
-	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
 	if (ret < 0)
 		printk(KERN_ERR "NILFS: GC failed during preparation: "
 			"cannot read source blocks: err=%d\n", ret);
 	else
 		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
 
-	if (ret < 0)
-		nilfs_remove_all_gcinode(nilfs);
+	nilfs_remove_all_gcinodes(nilfs);
 	clear_nilfs_gc_running(nilfs);
 
 out_free:
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..797cd437970e 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -248,6 +248,8 @@ extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
 extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+				       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
@@ -292,11 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
 int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
 				   struct buffer_head **);
 int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
-int nilfs_init_gccache(struct the_nilfs *);
-void nilfs_destroy_gccache(struct the_nilfs *);
-void nilfs_clear_gcinode(struct inode *);
-struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
-void nilfs_remove_all_gcinode(struct the_nilfs *);
+int nilfs_init_gcinode(struct inode *inode);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
 /* gcdat.c */
 int nilfs_init_gcdat_inode(struct the_nilfs *);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index eee4b223c293..9cf71389f369 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2451,9 +2451,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 	list_for_each_entry_safe(ii, n, head, i_dirty) {
 		if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
 			continue;
-		hlist_del_init(&ii->vfs_inode.i_hash);
 		list_del_init(&ii->i_dirty);
-		nilfs_clear_gcinode(&ii->vfs_inode);
+		iput(&ii->vfs_inode);
 	}
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 461b7211e14f..6a012b9e1b31 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -87,8 +87,8 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	init_rwsem(&nilfs->ns_writer_sem);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
+	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
-	nilfs->ns_gc_inodes_h = NULL;
 	init_rwsem(&nilfs->ns_segctor_sem);
 
 	return nilfs;
@@ -164,7 +164,6 @@ void put_nilfs(struct the_nilfs *nilfs)
 		nilfs_mdt_destroy(nilfs->ns_gc_dat);
 	}
 	if (nilfs_init(nilfs)) {
-		nilfs_destroy_gccache(nilfs);
 		brelse(nilfs->ns_sbh[0]);
 		brelse(nilfs->ns_sbh[1]);
 	}
@@ -736,11 +735,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto failed_sbh;
 
-	/* Initialize gcinode cache */
-	err = nilfs_init_gccache(nilfs);
-	if (err)
-		goto failed_sbh;
-
 	set_nilfs_init(nilfs);
 	err = 0;
  out:
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..c7ecd0c623a3 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -81,7 +81,6 @@ enum {
  * @ns_sufile: segusage file inode
  * @ns_gc_dat: shadow inode of the DAT file inode for GC
  * @ns_gc_inodes: dummy inodes to keep live blocks
- * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
  * @ns_blocksize_bits: bit length of block size
  * @ns_blocksize: block size
  * @ns_nsegments: number of segments in filesystem
@@ -165,9 +164,8 @@ struct the_nilfs {
 	struct inode	       *ns_sufile;
 	struct inode	       *ns_gc_dat;
 
-	/* GC inode list and hash table head */
+	/* GC inode list */
 	struct list_head	ns_gc_inodes;
-	struct hlist_head      *ns_gc_inodes_h;
 
 	/* Disk layout information (static) */
 	unsigned int		ns_blocksize_bits;
@@ -182,9 +180,6 @@ struct the_nilfs {
 	u32			ns_crc_seed;
 };
 
-#define NILFS_GCINODE_HASH_BITS		8
-#define NILFS_GCINODE_HASH_SIZE		(1<<NILFS_GCINODE_HASH_BITS)
-
 #define THE_NILFS_FNS(bit, name)					\
 static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
 {									\
-- 
cgit v1.2.3


From ba65ae4729bf81c58d9fc847f67d57eec525b042 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 14 Aug 2010 12:59:15 +0900
Subject: nilfs2: add checkpoint tree to nilfs object

To hold multiple versions of a filesystem in one sb instance, a new
on-memory structure is necessary to handle one or more checkpoints.

This adds a red-black tree of checkpoints to nilfs object, and adds
lookup and create functions for them.

Each checkpoint is represented by "nilfs_root" structure, and this
structure has rb_node to configure the rb-tree.

The nilfs_root object is identified with a checkpoint number.  For
each snapshot, a nilfs_root object is allocated and the checkpoint
number of snapshot is assigned to it.  For a regular mount
(i.e. current mode mount), NILFS_CPTREE_CURRENT_CNO constant is
assigned to the corresponding nilfs_root object.

Each nilfs_root object has an ifile inode and some counters.  These
items will displace those of nilfs_sb_info structure in successive
patches.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/the_nilfs.h | 42 +++++++++++++++++++++++
 2 files changed, 134 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6a012b9e1b31..f1d599273d9e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -89,6 +89,8 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	INIT_LIST_HEAD(&nilfs->ns_supers);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
+	nilfs->ns_cptree = RB_ROOT;
+	spin_lock_init(&nilfs->ns_cptree_lock);
 	init_rwsem(&nilfs->ns_segctor_sem);
 
 	return nilfs;
@@ -809,6 +811,96 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
 	return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
 }
 
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
+{
+	struct rb_node *n;
+	struct nilfs_root *root;
+
+	spin_lock(&nilfs->ns_cptree_lock);
+	n = nilfs->ns_cptree.rb_node;
+	while (n) {
+		root = rb_entry(n, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			n = n->rb_left;
+		} else if (cno > root->cno) {
+			n = n->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			return root;
+		}
+	}
+	spin_unlock(&nilfs->ns_cptree_lock);
+
+	return NULL;
+}
+
+struct nilfs_root *
+nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
+{
+	struct rb_node **p, *parent;
+	struct nilfs_root *root, *new;
+
+	root = nilfs_lookup_root(nilfs, cno);
+	if (root)
+		return root;
+
+	new = kmalloc(sizeof(*root), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	spin_lock(&nilfs->ns_cptree_lock);
+
+	p = &nilfs->ns_cptree.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		parent = *p;
+		root = rb_entry(parent, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			p = &(*p)->rb_left;
+		} else if (cno > root->cno) {
+			p = &(*p)->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			kfree(new);
+			return root;
+		}
+	}
+
+	new->cno = cno;
+	new->ifile = NULL;
+	new->nilfs = nilfs;
+	atomic_set(&new->count, 1);
+	atomic_set(&new->inodes_count, 0);
+	atomic_set(&new->blocks_count, 0);
+
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
+
+	spin_unlock(&nilfs->ns_cptree_lock);
+
+	return new;
+}
+
+void nilfs_put_root(struct nilfs_root *root)
+{
+	if (atomic_dec_and_test(&root->count)) {
+		struct the_nilfs *nilfs = root->nilfs;
+
+		spin_lock(&nilfs->ns_cptree_lock);
+		rb_erase(&root->rb_node, &nilfs->ns_cptree);
+		spin_unlock(&nilfs->ns_cptree_lock);
+		if (root->ifile)
+			nilfs_mdt_destroy(root->ifile);
+
+		kfree(root);
+	}
+}
+
 /**
  * nilfs_find_sbinfo - find existing nilfs_sb_info structure
  * @nilfs: nilfs object
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index c7ecd0c623a3..0afede68a9e1 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/buffer_head.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -80,6 +81,8 @@ enum {
  * @ns_cpfile: checkpoint file inode
  * @ns_sufile: segusage file inode
  * @ns_gc_dat: shadow inode of the DAT file inode for GC
+ * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
+ * @ns_cptree_lock: lock protecting @ns_cptree
  * @ns_gc_inodes: dummy inodes to keep live blocks
  * @ns_blocksize_bits: bit length of block size
  * @ns_blocksize: block size
@@ -164,6 +167,10 @@ struct the_nilfs {
 	struct inode	       *ns_sufile;
 	struct inode	       *ns_gc_dat;
 
+	/* Checkpoint tree */
+	struct rb_root		ns_cptree;
+	spinlock_t		ns_cptree_lock;
+
 	/* GC inode list */
 	struct list_head	ns_gc_inodes;
 
@@ -200,6 +207,32 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 
+/**
+ * struct nilfs_root - nilfs root object
+ * @cno: checkpoint number
+ * @rb_node: red-black tree node
+ * @count: refcount of this structure
+ * @nilfs: nilfs object
+ * @ifile: inode file
+ * @root: root inode
+ * @inodes_count: number of inodes
+ * @blocks_count: number of blocks (Reserved)
+ */
+struct nilfs_root {
+	__u64 cno;
+	struct rb_node rb_node;
+
+	atomic_t count;
+	struct the_nilfs *nilfs;
+	struct inode *ifile;
+
+	atomic_t inodes_count;
+	atomic_t blocks_count;
+};
+
+/* Special checkpoint number */
+#define NILFS_CPTREE_CURRENT_CNO	0
+
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
 
@@ -222,6 +255,10 @@ int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
+struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
+					     __u64 cno);
+void nilfs_put_root(struct nilfs_root *root);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
@@ -235,6 +272,11 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
 	atomic_inc(&nilfs->ns_count);
 }
 
+static inline void nilfs_get_root(struct nilfs_root *root)
+{
+	atomic_inc(&root->count);
+}
+
 static inline void
 nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 {
-- 
cgit v1.2.3


From 4d8d9293dce503eb0e083e17a02a328d397e7f00 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 25 Aug 2010 17:45:44 +0900
Subject: nilfs2: set pointer to root object in inodes

This puts a pointer to nilfs_root object in the private part of
on-memory inode, and makes nilfs_iget function pick up the inode with
the same root object.

Non-root inodes inherit its nilfs_root object from parent inode.  That
of the root inode is allocated through nilfs_attach_checkpoint()
function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c    | 27 ++++++++++++++++++++++-----
 fs/nilfs2/namei.c    |  5 +++--
 fs/nilfs2/nilfs.h    |  7 +++++--
 fs/nilfs2/recovery.c | 12 ++++++++----
 fs/nilfs2/super.c    | 37 ++++++++++++++++++++++++++++++-------
 5 files changed, 68 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 82cfdbc43e1c..7306fc7c4962 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -37,6 +37,7 @@
 struct nilfs_iget_args {
 	u64 ino;
 	__u64 cno;
+	struct nilfs_root *root;
 	int for_gc;
 };
 
@@ -284,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct inode *inode;
 	struct nilfs_inode_info *ii;
+	struct nilfs_root *root;
 	int err = -ENOMEM;
 	ino_t ino;
 
@@ -294,8 +296,10 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	mapping_set_gfp_mask(inode->i_mapping,
 			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 
+	root = NILFS_I(dir)->i_root;
 	ii = NILFS_I(inode);
 	ii->i_state = 1 << NILFS_I_NEW;
+	ii->i_root = root;
 
 	err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
 	if (unlikely(err))
@@ -484,7 +488,7 @@ static int nilfs_iget_test(struct inode *inode, void *opaque)
 	struct nilfs_iget_args *args = opaque;
 	struct nilfs_inode_info *ii;
 
-	if (args->ino != inode->i_ino)
+	if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
 		return 0;
 
 	ii = NILFS_I(inode);
@@ -502,13 +506,21 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
 	if (args->for_gc) {
 		NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
 		NILFS_I(inode)->i_cno = args->cno;
+		NILFS_I(inode)->i_root = NULL;
+	} else {
+		if (args->root && args->ino == NILFS_ROOT_INO)
+			nilfs_get_root(args->root);
+		NILFS_I(inode)->i_root = args->root;
 	}
 	return 0;
 }
 
-struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino)
 {
-	struct nilfs_iget_args args = { .ino = ino, .cno = 0, .for_gc = 0 };
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
 	struct inode *inode;
 	int err;
 
@@ -530,7 +542,9 @@ struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
 				__u64 cno)
 {
-	struct nilfs_iget_args args = { .ino = ino, .cno = cno, .for_gc = 1 };
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+	};
 	struct inode *inode;
 	int err;
 
@@ -682,6 +696,9 @@ static void nilfs_clear_inode(struct inode *inode)
 		nilfs_bmap_clear(ii->i_bmap);
 
 	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+
+	if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
+		nilfs_put_root(ii->i_root);
 }
 
 void nilfs_evict_inode(struct inode *inode)
@@ -690,7 +707,7 @@ void nilfs_evict_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
-	if (inode->i_nlink || unlikely(is_bad_inode(inode))) {
+	if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
 		end_writeback(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..1110d56a23f5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -70,7 +70,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	ino = nilfs_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
-		inode = nilfs_iget(dir->i_sb, ino);
+		inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
@@ -87,7 +87,8 @@ struct dentry *nilfs_get_parent(struct dentry *child)
 	if (!ino)
 		return ERR_PTR(-ENOENT);
 
-	inode = nilfs_iget(child->d_inode->i_sb, ino);
+	inode = nilfs_iget(child->d_inode->i_sb,
+			   NILFS_I(child->d_inode)->i_root, ino);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	return d_obtain_alias(inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 797cd437970e..21d90c4b4e2d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
 #endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
+	struct nilfs_root *i_root;
 	struct inode vfs_inode;
 };
 
@@ -247,7 +248,8 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
-extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino);
 extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 				       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
@@ -285,7 +287,8 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
 						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
 extern int nilfs_cleanup_super(struct nilfs_sb_info *);
-extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **root);
 extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
 
 /* gcinode.c */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..a9a5ba8f57d5 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -504,6 +504,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 
 static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 				      struct nilfs_sb_info *sbi,
+				      struct nilfs_root *root,
 				      struct list_head *head,
 				      unsigned long *nr_salvaged_blocks)
 {
@@ -515,7 +516,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 	int err = 0, err2 = 0;
 
 	list_for_each_entry_safe(rb, n, head, list) {
-		inode = nilfs_iget(sbi->s_super, rb->ino);
+		inode = nilfs_iget(sbi->s_super, root, rb->ino);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			inode = NULL;
@@ -578,6 +579,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
  */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
+				 struct nilfs_root *root,
 				 struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh_sum = NULL;
@@ -649,7 +651,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				goto failed;
 			if (flags & NILFS_SS_LOGEND) {
 				err = nilfs_recover_dsync_blocks(
-					nilfs, sbi, &dsync_blocks,
+					nilfs, sbi, root, &dsync_blocks,
 					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
@@ -746,19 +748,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 			      struct nilfs_sb_info *sbi,
 			      struct nilfs_recovery_info *ri)
 {
+	struct nilfs_root *root;
 	int err;
 
 	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
 		return 0;
 
-	err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
+	err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: error loading the latest checkpoint.\n");
 		return err;
 	}
 
-	err = nilfs_do_roll_forward(nilfs, sbi, ri);
+	err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
 	if (unlikely(err))
 		goto failed;
 
@@ -789,6 +792,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 
  failed:
 	nilfs_detach_checkpoint(sbi);
+	nilfs_put_root(root);
 	return err;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f3a00a3b2a03..a1c0e38a7706 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -391,18 +391,24 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 	return err;
 }
 
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **rootp)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_root *root;
 	struct nilfs_checkpoint *raw_cp;
 	struct buffer_head *bh_cp;
-	int err;
+	int err = -ENOMEM;
+
+	root = nilfs_find_or_create_root(
+		nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
+	if (!root)
+		return err;
 
 	down_write(&nilfs->ns_super_sem);
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
-	err = -ENOMEM;
 	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
 		goto delist;
@@ -428,6 +434,8 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+
+	*rootp = root;
 	return 0;
 
  failed_bh:
@@ -440,6 +448,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
 	up_write(&nilfs->ns_super_sem);
+	nilfs_put_root(root);
 
 	return err;
 }
@@ -551,12 +560,20 @@ static struct inode *
 nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
 {
 	struct inode *inode;
+	struct nilfs_root *root;
 
 	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
 	    ino != NILFS_SKETCH_INO)
 		return ERR_PTR(-ESTALE);
 
-	inode = nilfs_iget(sb, ino);
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs,
+				 NILFS_CPTREE_CURRENT_CNO);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	/* new file handle type is required to export snapshots */
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (generation && inode->i_generation != generation) {
@@ -815,9 +832,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		 struct the_nilfs *nilfs)
 {
 	struct nilfs_sb_info *sbi;
+	struct nilfs_root *fsroot;
 	struct inode *root;
 	__u64 cno;
-	int err;
+	int err, curr_mnt;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -859,6 +877,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_sbi;
 
 	cno = nilfs_last_cno(nilfs);
+	curr_mnt = true;
 
 	if (sb->s_flags & MS_RDONLY) {
 		if (nilfs_test_opt(sbi, SNAPSHOT)) {
@@ -881,10 +900,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 				goto failed_sbi;
 			}
 			cno = sbi->s_snapshot_cno;
+			curr_mnt = false;
 		}
 	}
 
-	err = nilfs_attach_checkpoint(sbi, cno);
+	err = nilfs_attach_checkpoint(sbi, cno, curr_mnt, &fsroot);
 	if (err) {
 		printk(KERN_ERR "NILFS: error loading a checkpoint"
 		       " (checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -897,7 +917,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 			goto failed_checkpoint;
 	}
 
-	root = nilfs_iget(sb, NILFS_ROOT_INO);
+	root = nilfs_iget(sb, fsroot, NILFS_ROOT_INO);
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "NILFS: get root inode failed\n");
 		err = PTR_ERR(root);
@@ -917,6 +937,8 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_segctor;
 	}
 
+	nilfs_put_root(fsroot);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
@@ -935,6 +957,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
  failed_checkpoint:
 	nilfs_detach_checkpoint(sbi);
+	nilfs_put_root(fsroot);
 
  failed_sbi:
 	put_nilfs(nilfs);
-- 
cgit v1.2.3


From 8e656fd518784b49453f60c5f78b78703bc85cb2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 27 Aug 2010 00:23:02 +0900
Subject: nilfs2: make snapshots in checkpoint tree exportable

The previous export operations cannot handle multiple versions of
a filesystem if they belong to the same sb instance.

This adds a new type of file handle and extends export operations so
that they can get the inode specified by a checkpoint number as well
as an inode number and a generation number.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/export.h       |  17 ++++++
 fs/nilfs2/namei.c        | 137 +++++++++++++++++++++++++++++++++++++++++------
 fs/nilfs2/nilfs.h        |   3 --
 fs/nilfs2/super.c        |  52 +-----------------
 include/linux/exportfs.h |  13 +++++
 5 files changed, 151 insertions(+), 71 deletions(-)
 create mode 100644 fs/nilfs2/export.h

(limited to 'fs')

diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
+#ifndef NILFS_EXPORT_H
+#define NILFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations nilfs_export_ops;
+
+struct nilfs_fid {
+	u64 cno;
+	u64 ino;
+	u32 gen;
+
+	u32 parent_gen;
+	u64 parent_ino;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1110d56a23f5..a65f46560fbe 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
 
 #include <linux/pagemap.h>
 #include "nilfs.h"
+#include "export.h"
 
+#define NILFS_FID_SIZE_NON_CONNECTABLE \
+	(offsetof(struct nilfs_fid, parent_gen) / 4)
+#define NILFS_FID_SIZE_CONNECTABLE	(sizeof(struct nilfs_fid) / 4)
 
 static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -77,23 +81,6 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	return d_splice_alias(inode, dentry);
 }
 
-struct dentry *nilfs_get_parent(struct dentry *child)
-{
-	unsigned long ino;
-	struct inode *inode;
-	struct qstr dotdot = {.name = "..", .len = 2};
-
-	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
-	if (!ino)
-		return ERR_PTR(-ENOENT);
-
-	inode = nilfs_iget(child->d_inode->i_sb,
-			   NILFS_I(child->d_inode)->i_root, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	return d_obtain_alias(inode);
-}
-
 /*
  * By the time this is called, we already have created
  * the directory cache entry for the new file, but it
@@ -469,6 +456,115 @@ out:
 	return err;
 }
 
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct inode *inode;
+	struct qstr dotdot = {.name = "..", .len = 2};
+	struct nilfs_root *root;
+
+	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	root = NILFS_I(child->d_inode)->i_root;
+
+	inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+				       u64 ino, u32 gen)
+{
+	struct nilfs_root *root;
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (gen && inode->i_generation != gen) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	    (fh_type != FILEID_NILFS_WITH_PARENT &&
+	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	    fh_type != FILEID_NILFS_WITH_PARENT)
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+
+static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+			   int connectable)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+	struct inode *inode = dentry->d_inode;
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	int type;
+
+	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
+	    (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+		return 255;
+
+	fid->cno = root->cno;
+	fid->ino = inode->i_ino;
+	fid->gen = inode->i_generation;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		fid->parent_ino = parent->i_ino;
+		fid->parent_gen = parent->i_generation;
+		spin_unlock(&dentry->d_lock);
+
+		type = FILEID_NILFS_WITH_PARENT;
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+	} else {
+		type = FILEID_NILFS_WITHOUT_PARENT;
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+	}
+
+	return type;
+}
+
 const struct inode_operations nilfs_dir_inode_operations = {
 	.create		= nilfs_create,
 	.lookup		= nilfs_lookup,
@@ -493,3 +589,10 @@ const struct inode_operations nilfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 };
+
+const struct export_operations nilfs_export_ops = {
+	.encode_fh = nilfs_encode_fh,
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
+};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 21d90c4b4e2d..aa7940f7ecf1 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -264,9 +264,6 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
 
-/* namei.c */
-extern struct dentry *nilfs_get_parent(struct dentry *);
-
 /* super.c */
 extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index a1c0e38a7706..adbf5826b837 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -48,10 +48,10 @@
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/kobject.h>
-#include <linux/exportfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
+#include "export.h"
 #include "mdt.h"
 #include "alloc.h"
 #include "btree.h"
@@ -556,56 +556,6 @@ static const struct super_operations nilfs_sops = {
 	.show_options = nilfs_show_options
 };
 
-static struct inode *
-nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
-{
-	struct inode *inode;
-	struct nilfs_root *root;
-
-	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
-	    ino != NILFS_SKETCH_INO)
-		return ERR_PTR(-ESTALE);
-
-	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs,
-				 NILFS_CPTREE_CURRENT_CNO);
-	if (!root)
-		return ERR_PTR(-ESTALE);
-
-	/* new file handle type is required to export snapshots */
-	inode = nilfs_iget(sb, root, ino);
-	nilfs_put_root(root);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	if (generation && inode->i_generation != generation) {
-		iput(inode);
-		return ERR_PTR(-ESTALE);
-	}
-
-	return inode;
-}
-
-static struct dentry *
-nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
-		   int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    nilfs_nfs_get_inode);
-}
-
-static struct dentry *
-nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
-		   int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    nilfs_nfs_get_inode);
-}
-
-static const struct export_operations nilfs_export_ops = {
-	.fh_to_dentry = nilfs_fh_to_dentry,
-	.fh_to_parent = nilfs_fh_to_parent,
-	.get_parent = nilfs_get_parent,
-};
-
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index a9cd507f8cd2..28028988c862 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -67,6 +67,19 @@ enum fid_type {
 	 * 32 bit parent block number, 32 bit parent generation number
 	 */
 	FILEID_UDF_WITH_PARENT = 0x52,
+
+	/*
+	 * 64 bit checkpoint number, 64 bit inode number,
+	 * 32 bit generation number.
+	 */
+	FILEID_NILFS_WITHOUT_PARENT = 0x61,
+
+	/*
+	 * 64 bit checkpoint number, 64 bit inode number,
+	 * 32 bit generation number, 32 bit parent generation.
+	 * 64 bit parent inode number.
+	 */
+	FILEID_NILFS_WITH_PARENT = 0x62,
 };
 
 struct fid {
-- 
cgit v1.2.3


From e912a5b66837ee89fb025e67b5efeaa11930c2ce Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 14 Aug 2010 13:07:15 +0900
Subject: nilfs2: use root object to get ifile

This rewrites functions using ifile so that they get ifile from
nilfs_root object, and will remove sbi->s_ifile.  Some functions that
don't know the root object are extended to receive it from caller.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c    | 48 +++++++++++++++++++++---------------------------
 fs/nilfs2/nilfs.h    |  1 -
 fs/nilfs2/recovery.c |  3 +--
 fs/nilfs2/sb.h       |  3 ---
 fs/nilfs2/segment.c  | 43 ++++++++++++++++++++++++++-----------------
 fs/nilfs2/segment.h  |  7 ++++++-
 fs/nilfs2/super.c    | 42 +++++++++++++-----------------------------
 7 files changed, 67 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7306fc7c4962..7e883d5a5033 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -301,7 +301,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	ii->i_state = 1 << NILFS_I_NEW;
 	ii->i_root = root;
 
-	err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
+	err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
 	if (unlikely(err))
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
@@ -358,16 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	return ERR_PTR(err);
 }
 
-void nilfs_free_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-
-	/* XXX: check error code? Is there any thing I can do? */
-	(void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
-	atomic_dec(&sbi->s_inodes_count);
-}
-
 void nilfs_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = NILFS_I(inode)->i_flags;
@@ -431,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
 	return 0;
 }
 
-static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+static int __nilfs_read_inode(struct super_block *sb,
+			      struct nilfs_root *root, unsigned long ino,
 			      struct inode *inode)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -441,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 	int err;
 
 	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+	err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
 	if (unlikely(err))
 		goto bad_inode;
 
-	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+	raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
 
 	err = nilfs_read_inode_common(inode, raw_inode);
 	if (err)
@@ -468,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 			inode, inode->i_mode,
 			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
 	brelse(bh);
 	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
 	nilfs_set_inode_flags(inode);
 	return 0;
 
  failed_unmap:
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
 	brelse(bh);
 
  bad_inode:
@@ -530,7 +521,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	err = __nilfs_read_inode(sb, ino, inode);
+	err = __nilfs_read_inode(sb, root, ino, inode);
 	if (unlikely(err)) {
 		iget_failed(inode);
 		return ERR_PTR(err);
@@ -595,21 +586,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 {
 	ino_t ino = inode->i_ino;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct super_block *sb = inode->i_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *ifile = ii->i_root->ifile;
 	struct nilfs_inode *raw_inode;
 
-	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
+	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
 
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
-		memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
+		memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
 	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
 
 	nilfs_write_inode_common(inode, raw_inode, 0);
 		/* XXX: call with has_bmap = 0 is a workaround to avoid
 		   deadlock of bmap. This delays update of i_bmap to just
 		   before writing */
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
+	nilfs_ifile_unmap_inode(ifile, ino, ibh);
 }
 
 #define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
@@ -719,12 +709,16 @@ void nilfs_evict_inode(struct inode *inode)
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 
+	/* TODO: some of the following operations may fail.  */
 	nilfs_truncate_bmap(ii, 0);
 	nilfs_mark_inode_dirty(inode);
 	end_writeback(inode);
+
+	nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
+	atomic_dec(&NILFS_SB(sb)->s_inodes_count);
+
 	nilfs_clear_inode(inode);
-	nilfs_free_inode(inode);
-	/* nilfs_free_inode() marks inode buffer dirty */
+
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 	nilfs_transaction_commit(sb);
@@ -779,8 +773,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
 	spin_lock(&sbi->s_inode_lock);
 	if (ii->i_bh == NULL) {
 		spin_unlock(&sbi->s_inode_lock);
-		err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
-						  pbh);
+		err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
+						  inode->i_ino, pbh);
 		if (unlikely(err))
 			return err;
 		spin_lock(&sbi->s_inode_lock);
@@ -860,7 +854,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 	}
 	nilfs_update_inode(inode, ibh);
 	nilfs_mdt_mark_buffer_dirty(ibh);
-	nilfs_mdt_mark_dirty(sbi->s_ifile);
+	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
 	brelse(ibh);
 	return 0;
 }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index aa7940f7ecf1..3870c109aba3 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -286,7 +286,6 @@ extern int nilfs_commit_super(struct nilfs_sb_info *, int);
 extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 			    struct nilfs_root **root);
-extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
 
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index a9a5ba8f57d5..dcb5a9812c6c 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -773,7 +773,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		err = nilfs_attach_segment_constructor(sbi);
+		err = nilfs_attach_segment_constructor(sbi, root);
 		if (unlikely(err))
 			goto failed;
 
@@ -791,7 +791,6 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 	}
 
  failed:
-	nilfs_detach_checkpoint(sbi);
 	nilfs_put_root(root);
 	return err;
 }
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..50c418e6438e 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -68,9 +68,6 @@ struct nilfs_sb_info {
 	spinlock_t s_inode_lock;	/* Lock for the nilfs inode.
 					   It covers s_dirty_files list */
 
-	/* Metadata files */
-	struct inode *s_ifile;		/* index file inode */
-
 	/* Inode allocator */
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9cf71389f369..2a6b74e6699d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -763,12 +763,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
 	}
 }
 
-static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
+static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
+				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int ret = 0;
 
-	if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
+	if (nilfs_mdt_fetch_dirty(root->ifile))
 		ret++;
 	if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
 		ret++;
@@ -793,7 +793,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	int ret = 0;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	spin_lock(&sbi->s_inode_lock);
@@ -809,7 +809,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_clear_dirty(sbi->s_ifile);
+	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
 	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
 	nilfs_mdt_clear_dirty(nilfs->ns_sufile);
 	nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -869,7 +869,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	else
 		nilfs_checkpoint_set_minor(raw_cp);
 
-	nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
+	nilfs_write_inode_common(sci->sc_root->ifile,
+				 &raw_cp->cp_ifile_inode, 1);
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
 	return 0;
 
@@ -894,13 +895,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
 	}
 }
 
-static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
-					    struct inode *ifile)
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
 {
 	struct nilfs_inode_info *ii;
 
 	list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
-		nilfs_fill_in_file_bmap(ifile, ii);
+		nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
 		set_bit(NILFS_I_COLLECTED, &ii->i_state);
 	}
 }
@@ -1143,7 +1143,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
 		/* Fall through */
 	case NILFS_ST_IFILE:
-		err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+		err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
@@ -1984,6 +1984,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 					struct nilfs_sb_info *sbi)
 {
 	struct nilfs_inode_info *ii, *n;
+	struct inode *ifile = sci->sc_root->ifile;
 
 	spin_lock(&sbi->s_inode_lock);
  retry:
@@ -1994,14 +1995,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 
 			spin_unlock(&sbi->s_inode_lock);
 			err = nilfs_ifile_get_inode_block(
-				sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+				ifile, ii->vfs_inode.i_ino, &ibh);
 			if (unlikely(err)) {
 				nilfs_warning(sbi->s_super, __func__,
 					      "failed to get inode block.\n");
 				return err;
 			}
 			nilfs_mdt_mark_buffer_dirty(ibh);
-			nilfs_mdt_mark_dirty(sbi->s_ifile);
+			nilfs_mdt_mark_dirty(ifile);
 			spin_lock(&sbi->s_inode_lock);
 			if (likely(!ii->i_bh))
 				ii->i_bh = ibh;
@@ -2058,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	if (unlikely(err))
 		goto out;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	if (nilfs_segctor_clean(sci))
@@ -2090,7 +2091,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			goto failed;
 
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
-			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
+			nilfs_segctor_fill_in_file_bmap(sci);
 
 		if (mode == SC_LSEG_SR &&
 		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2684,7 +2685,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 /*
  * Setup & clean-up functions
  */
-static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
+static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
+					       struct nilfs_root *root)
 {
 	struct nilfs_sc_info *sci;
 
@@ -2695,6 +2697,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	sci->sc_sbi = sbi;
 	sci->sc_super = sbi->s_super;
 
+	nilfs_get_root(root);
+	sci->sc_root = root;
+
 	init_waitqueue_head(&sci->sc_wait_request);
 	init_waitqueue_head(&sci->sc_wait_daemon);
 	init_waitqueue_head(&sci->sc_wait_task);
@@ -2769,6 +2774,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 	WARN_ON(!list_empty(&sci->sc_write_logs));
 
+	nilfs_put_root(sci->sc_root);
+
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
 	del_timer_sync(&sci->sc_timer);
@@ -2778,6 +2785,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 /**
  * nilfs_attach_segment_constructor - attach a segment constructor
  * @sbi: nilfs_sb_info
+ * @root: root object of the current filesystem tree
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
  * initializes it, and starts the segment constructor.
@@ -2787,7 +2795,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
@@ -2801,7 +2810,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 		nilfs_detach_segment_constructor(sbi);
 	}
 
-	sbi->s_sc_info = nilfs_segctor_new(sbi);
+	sbi->s_sc_info = nilfs_segctor_new(sbi, root);
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
 
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 675d932148a4..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
 #include <linux/nilfs2_fs.h>
 #include "sb.h"
 
+struct nilfs_root;
+
 /**
  * struct nilfs_recovery_info - Recovery information
  * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
  * struct nilfs_sc_info - Segment constructor information
  * @sc_super: Back pointer to super_block struct
  * @sc_sbi: Back pointer to nilfs_sb_info struct
+ * @sc_root: root object of the current filesystem tree
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -129,6 +132,7 @@ struct nilfs_segsum_pointer {
 struct nilfs_sc_info {
 	struct super_block     *sc_super;
 	struct nilfs_sb_info   *sc_sbi;
+	struct nilfs_root      *sc_root;
 
 	unsigned long		sc_nblk_inc;
 
@@ -231,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
 				void **);
 
-extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
 /* recovery.c */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index adbf5826b837..87c57810ae88 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -358,9 +358,9 @@ static void nilfs_put_super(struct super_block *sb)
 	down_write(&nilfs->ns_super_sem);
 	if (nilfs->ns_current == sbi)
 		nilfs->ns_current = NULL;
+	list_del_init(&sbi->s_list);
 	up_write(&nilfs->ns_super_sem);
 
-	nilfs_detach_checkpoint(sbi);
 	put_nilfs(sbi->s_nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
@@ -405,13 +405,12 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 	if (!root)
 		return err;
 
-	down_write(&nilfs->ns_super_sem);
-	list_add(&sbi->s_list, &nilfs->ns_supers);
-	up_write(&nilfs->ns_super_sem);
+	if (root->ifile)
+		goto reuse; /* already attached checkpoint */
 
-	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
-	if (!sbi->s_ifile)
-		goto delist;
+	root->ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
+	if (!root->ifile)
+		goto failed;
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -427,7 +426,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 		}
 		goto failed;
 	}
-	err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
+	err = nilfs_read_inode_common(root->ifile, &raw_cp->cp_ifile_inode);
 	if (unlikely(err))
 		goto failed_bh;
 	atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
@@ -435,35 +434,18 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
+ reuse:
 	*rootp = root;
 	return 0;
 
  failed_bh:
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
  failed:
-	nilfs_mdt_destroy(sbi->s_ifile);
-	sbi->s_ifile = NULL;
-
- delist:
-	down_write(&nilfs->ns_super_sem);
-	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_super_sem);
 	nilfs_put_root(root);
 
 	return err;
 }
 
-void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
-{
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-
-	nilfs_mdt_destroy(sbi->s_ifile);
-	sbi->s_ifile = NULL;
-	down_write(&nilfs->ns_super_sem);
-	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_super_sem);
-}
-
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -862,7 +844,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		err = nilfs_attach_segment_constructor(sbi);
+		err = nilfs_attach_segment_constructor(sbi, fsroot);
 		if (err)
 			goto failed_checkpoint;
 	}
@@ -896,6 +878,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	down_write(&nilfs->ns_super_sem);
+	list_add(&sbi->s_list, &nilfs->ns_supers);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
 		nilfs->ns_current = sbi;
 	up_write(&nilfs->ns_super_sem);
@@ -906,7 +889,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	nilfs_detach_segment_constructor(sbi);
 
  failed_checkpoint:
-	nilfs_detach_checkpoint(sbi);
 	nilfs_put_root(fsroot);
 
  failed_sbi:
@@ -966,6 +948,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		up_write(&nilfs->ns_sem);
 	} else {
 		__u64 features;
+		struct nilfs_root *root;
 
 		/*
 		 * Mounting a RDONLY partition read-write, so reread and
@@ -987,7 +970,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 		sb->s_flags &= ~MS_RDONLY;
 
-		err = nilfs_attach_segment_constructor(sbi);
+		root = NILFS_I(sb->s_root->d_inode)->i_root;
+		err = nilfs_attach_segment_constructor(sbi, root);
 		if (err)
 			goto restore_opts;
 
-- 
cgit v1.2.3


From b7c0634204993d7c6678c852e4bd118426599111 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 14 Aug 2010 14:48:32 +0900
Subject: nilfs2: move inode count and block count into root object

This moves sbi->s_inodes_count and sbi->s_blocks_count into nilfs_root
object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c   |  4 ++--
 fs/nilfs2/sb.h      |  2 --
 fs/nilfs2/segment.c |  4 ++--
 fs/nilfs2/super.c   | 11 ++++++-----
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e883d5a5033..ca09e4362d66 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -306,7 +306,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&sbi->s_inodes_count);
+	atomic_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -715,7 +715,7 @@ void nilfs_evict_inode(struct inode *inode)
 	end_writeback(inode);
 
 	nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
-	atomic_dec(&NILFS_SB(sb)->s_inodes_count);
+	atomic_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 50c418e6438e..3cc3675c3abe 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -44,8 +44,6 @@ struct nilfs_sc_info;
 struct nilfs_sb_info {
 	/* Snapshot status */
 	__u64 s_snapshot_cno;		/* Checkpoint number */
-	atomic_t s_inodes_count;
-	atomic_t s_blocks_count;	/* Reserved (might be deleted) */
 
 	/* Mount options */
 	unsigned long s_mount_opt;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2a6b74e6699d..b75306d642c2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -856,9 +856,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sbi->s_inodes_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sbi->s_blocks_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 87c57810ae88..acfa74e45f0b 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -429,8 +429,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 	err = nilfs_read_inode_common(root->ifile, &raw_cp->cp_ifile_inode);
 	if (unlikely(err))
 		goto failed_bh;
-	atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+
+	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -449,8 +450,8 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
+	struct the_nilfs *nilfs = root->nilfs;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 	unsigned long long blocks;
 	unsigned long overhead;
@@ -486,7 +487,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&sbi->s_inodes_count);
+	buf->f_files = atomic_read(&root->inodes_count);
 	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
-- 
cgit v1.2.3


From fd52202930b7e8db48bee5a6fc6b1f438e822a23 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 14 Aug 2010 21:44:51 +0900
Subject: nilfs2: use checkpoint tree for mount check of snapshots

This rewrites nilfs_checkpoint_is_mounted() function so that it
decides whether a checkpoint is mounted by whether the corresponding
root object is found in checkpoint tree.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index f1d599273d9e..89c78562d0e9 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -954,26 +954,20 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
 int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 				int snapshot_mount)
 {
-	struct nilfs_sb_info *sbi;
-	int ret = 0;
+	struct nilfs_root *root;
+	int ret;
 
-	down_read(&nilfs->ns_super_sem);
-	if (cno == 0 || cno > nilfs->ns_cno)
-		goto out_unlock;
+	if (cno < 0 || cno > nilfs->ns_cno)
+		return false;
 
-	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
-		if (sbi->s_snapshot_cno == cno &&
-		    (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
-					/* exclude read-only mounts */
-			ret++;
-			break;
-		}
-	}
-	/* for protecting recent checkpoints */
 	if (cno >= nilfs_last_cno(nilfs))
-		ret++;
+		return true;	/* protect recent checkpoints */
 
- out_unlock:
-	up_read(&nilfs->ns_super_sem);
+	ret = false;
+	root = nilfs_lookup_root(nilfs, cno);
+	if (root) {
+		ret = true;
+		nilfs_put_root(root);
+	}
 	return ret;
 }
-- 
cgit v1.2.3


From dc3d3b810a644dfa329efaa230cd514226f8981d Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 15 Aug 2010 23:33:57 +0900
Subject: nilfs2: deny write access to inodes in snapshots

Snapshots of nilfs are read-only.

After super block instances (sb) will be unified, nilfs will need to
check write access by a way other than implicit test with
IS_RDONLY(inode).  This is because IS_RDONLY() refers to MS_RDONLY bit
of inode->i_sb->s_flags and it will become inaccurate after the
unification of sb.

To prepare for the issue, this uses i_op->permission to deny write
access to inodes in snapshots.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 11 +++++++++++
 fs/nilfs2/namei.c |  1 +
 fs/nilfs2/nilfs.h |  4 +---
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index ca09e4362d66..3efef0ecfa24 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -764,6 +764,17 @@ out_err:
 	return err;
 }
 
+int nilfs_permission(struct inode *inode, int mask)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	if ((mask & MAY_WRITE) && root &&
+	    root->cno != NILFS_CPTREE_CURRENT_CNO)
+		return -EROFS; /* snapshot is not writable */
+
+	return generic_permission(inode, mask, NULL);
+}
+
 int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
 			   struct buffer_head **pbh)
 {
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a65f46560fbe..185d1607cb00 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -588,6 +588,7 @@ const struct inode_operations nilfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.permission     = nilfs_permission,
 };
 
 const struct export_operations nilfs_export_ops = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 3870c109aba3..cf5507a2a178 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -201,12 +201,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
  */
 #ifdef CONFIG_NILFS_POSIX_ACL
 #error "NILFS: not yet supported POSIX ACL"
-extern int nilfs_permission(struct inode *, int, struct nameidata *);
 extern int nilfs_acl_chmod(struct inode *);
 extern int nilfs_init_acl(struct inode *, struct inode *);
 #else
-#define nilfs_permission   NULL
-
 static inline int nilfs_acl_chmod(struct inode *inode)
 {
 	return 0;
@@ -256,6 +253,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+int nilfs_permission(struct inode *inode, int mask);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
-- 
cgit v1.2.3


From 367ea33486a68f935a01311a3be9b7e97d2e5ead Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 26 Aug 2010 01:52:51 +0900
Subject: nilfs2: split out nilfs_get_root_dentry

This splits the code to allocate root dentry into a separate routine
for convenience in successive changes.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 54 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index acfa74e45f0b..1e12930f8b94 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -750,6 +750,39 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
 	return 0;
 }
 
+static int nilfs_get_root_dentry(struct super_block *sb,
+				 struct nilfs_root *root,
+				 struct dentry **root_dentry)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret = 0;
+
+	inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
+	if (IS_ERR(inode)) {
+		printk(KERN_ERR "NILFS: get root inode failed\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+	if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
+		iput(inode);
+		printk(KERN_ERR "NILFS: corrupt root inode.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dentry = d_alloc_root(inode);
+	if (!dentry) {
+		iput(inode);
+		printk(KERN_ERR "NILFS: get root dentry failed\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	*root_dentry = dentry;
+ out:
+	return ret;
+}
+
 /**
  * nilfs_fill_super() - initialize a super block instance
  * @sb: super_block
@@ -766,7 +799,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 {
 	struct nilfs_sb_info *sbi;
 	struct nilfs_root *fsroot;
-	struct inode *root;
 	__u64 cno;
 	int err, curr_mnt;
 
@@ -850,25 +882,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 			goto failed_checkpoint;
 	}
 
-	root = nilfs_iget(sb, fsroot, NILFS_ROOT_INO);
-	if (IS_ERR(root)) {
-		printk(KERN_ERR "NILFS: get root inode failed\n");
-		err = PTR_ERR(root);
-		goto failed_segctor;
-	}
-	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-		iput(root);
-		printk(KERN_ERR "NILFS: corrupt root inode.\n");
-		err = -EINVAL;
-		goto failed_segctor;
-	}
-	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root) {
-		iput(root);
-		printk(KERN_ERR "NILFS: get root dentry failed\n");
-		err = -ENOMEM;
+	err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
+	if (err)
 		goto failed_segctor;
-	}
 
 	nilfs_put_root(fsroot);
 
-- 
cgit v1.2.3


From ab4d8f7ebf33beff97e766d18db47f1ea9635769 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 26 Aug 2010 02:15:41 +0900
Subject: nilfs2: split out nilfs_attach_snapshot

This splits the code to attach snapshots into a separate routine for
convenience sake.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 73 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1e12930f8b94..ebeb746c4845 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -783,6 +783,40 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 	return ret;
 }
 
+static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
+				 struct dentry **root_dentry)
+{
+	struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
+	struct nilfs_root *root;
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
+	up_read(&nilfs->ns_segctor_sem);
+	if (ret < 0) {
+		ret = (ret == -ENOENT) ? -EINVAL : ret;
+		goto out;
+	} else if (!ret) {
+		printk(KERN_ERR "NILFS: The specified checkpoint is "
+		       "not a snapshot (checkpoint number=%llu).\n",
+		       (unsigned long long)cno);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
+	if (ret) {
+		printk(KERN_ERR "NILFS: error loading snapshot "
+		       "(checkpoint number=%llu).\n",
+	       (unsigned long long)cno);
+		goto out;
+	}
+	ret = nilfs_get_root_dentry(s, root, root_dentry);
+	nilfs_put_root(root);
+ out:
+	return ret;
+}
+
 /**
  * nilfs_fill_super() - initialize a super block instance
  * @sb: super_block
@@ -800,7 +834,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	struct nilfs_sb_info *sbi;
 	struct nilfs_root *fsroot;
 	__u64 cno;
-	int err, curr_mnt;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -841,35 +875,17 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	if (err)
 		goto failed_sbi;
 
-	cno = nilfs_last_cno(nilfs);
-	curr_mnt = true;
-
-	if (sb->s_flags & MS_RDONLY) {
-		if (nilfs_test_opt(sbi, SNAPSHOT)) {
-			down_read(&nilfs->ns_segctor_sem);
-			err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
-						       sbi->s_snapshot_cno);
-			up_read(&nilfs->ns_segctor_sem);
-			if (err < 0) {
-				if (err == -ENOENT)
-					err = -EINVAL;
-				goto failed_sbi;
-			}
-			if (!err) {
-				printk(KERN_ERR
-				       "NILFS: The specified checkpoint is "
-				       "not a snapshot "
-				       "(checkpoint number=%llu).\n",
-				       (unsigned long long)sbi->s_snapshot_cno);
-				err = -EINVAL;
-				goto failed_sbi;
-			}
-			cno = sbi->s_snapshot_cno;
-			curr_mnt = false;
-		}
+	if (nilfs_test_opt(sbi, SNAPSHOT)) {
+		err = nilfs_attach_snapshot(sb, sbi->s_snapshot_cno,
+					    &sb->s_root);
+		if (err)
+			goto failed_sbi;
+
+		goto add_to_supers;
 	}
 
-	err = nilfs_attach_checkpoint(sbi, cno, curr_mnt, &fsroot);
+	cno = nilfs_last_cno(nilfs);
+	err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
 	if (err) {
 		printk(KERN_ERR "NILFS: error loading a checkpoint"
 		       " (checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -894,6 +910,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		up_write(&nilfs->ns_sem);
 	}
 
+ add_to_supers:
 	down_write(&nilfs->ns_super_sem);
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
-- 
cgit v1.2.3


From f11459ad7dab9e9eb5a05b8bd3bec338ea8f485d Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 16 Aug 2010 01:54:52 +0900
Subject: nilfs2: do not allocate multiple super block instances for a device

This stops allocating multiple super block instances for a device.

All snapshots and a current mode mount (i.e. latest tree) will be
controlled with nilfs_root objects that are kept within an sb
instance.

nilfs_get_sb() is rewritten so that it always has a root object for
the latest tree and snapshots make additional root objects.

The root dentry of the latest tree is binded to sb->s_root even if it
isn't attached on a directory.  Root dentries of snapshots or the
latest tree are binded to mnt->mnt_root on which they are mounted.

With this patch, nilfs_find_sbinfo() function, nilfs->ns_supers list,
and nilfs->ns_current back pointer, are deleted.  In addition,
init_nilfs() and load_nilfs() are simplified since they will be called
once for a device, not repeatedly called for mount points.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/sb.h        |   5 --
 fs/nilfs2/super.c     | 216 +++++++++++++++++++++++---------------------------
 fs/nilfs2/the_nilfs.c |  89 +--------------------
 fs/nilfs2/the_nilfs.h |  16 ----
 4 files changed, 101 insertions(+), 225 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 3cc3675c3abe..35a07157b980 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,9 +42,6 @@ struct nilfs_sc_info;
  * NILFS super-block data in memory
  */
 struct nilfs_sb_info {
-	/* Snapshot status */
-	__u64 s_snapshot_cno;		/* Checkpoint number */
-
 	/* Mount options */
 	unsigned long s_mount_opt;
 	uid_t s_resuid;
@@ -57,8 +54,6 @@ struct nilfs_sb_info {
 	/* Fundamental members */
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
-	struct list_head s_list;	/* list head for nilfs->ns_supers */
-	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ebeb746c4845..2e58e7c629b5 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -355,16 +355,11 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	}
-	down_write(&nilfs->ns_super_sem);
-	if (nilfs->ns_current == sbi)
-		nilfs->ns_current = NULL;
-	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_super_sem);
 
 	put_nilfs(sbi->s_nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	nilfs_put_sbinfo(sbi);
+	kfree(sbi);
 }
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -500,12 +495,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
 
 	if (!nilfs_test_opt(sbi, BARRIER))
 		seq_puts(seq, ",nobarrier");
-	if (nilfs_test_opt(sbi, SNAPSHOT))
-		seq_printf(seq, ",cp=%llu",
-			   (unsigned long long int)sbi->s_snapshot_cno);
+	if (root->cno != NILFS_CPTREE_CURRENT_CNO)
+		seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
 		seq_puts(seq, ",errors=panic");
 	if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -605,27 +600,11 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 			if (match_int(&args[0], &option) || option <= 0)
 				return 0;
 			if (is_remount) {
-				if (!nilfs_test_opt(sbi, SNAPSHOT)) {
-					printk(KERN_ERR
-					       "NILFS: cannot change regular "
-					       "mount to snapshot.\n");
-					return 0;
-				} else if (option != sbi->s_snapshot_cno) {
-					printk(KERN_ERR
-					       "NILFS: cannot remount to a "
-					       "different snapshot.\n");
-					return 0;
-				}
-				break;
-			}
-			if (!(sb->s_flags & MS_RDONLY)) {
-				printk(KERN_ERR "NILFS: cannot mount snapshot "
-				       "read/write.  A read-only option is "
-				       "required.\n");
+				printk(KERN_ERR
+				       "NILFS: \"%s\" option is invalid "
+				       "for remount.\n", p);
 				return 0;
 			}
-			sbi->s_snapshot_cno = option;
-			nilfs_set_opt(sbi, SNAPSHOT);
 			break;
 		case Opt_norecovery:
 			nilfs_set_opt(sbi, NORECOVERY);
@@ -771,16 +750,32 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 		goto out;
 	}
 
-	dentry = d_alloc_root(inode);
-	if (!dentry) {
-		iput(inode);
-		printk(KERN_ERR "NILFS: get root dentry failed\n");
-		ret = -ENOMEM;
-		goto out;
+	if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+		dentry = d_find_alias(inode);
+		if (!dentry) {
+			dentry = d_alloc_root(inode);
+			if (!dentry) {
+				iput(inode);
+				ret = -ENOMEM;
+				goto failed_dentry;
+			}
+		} else {
+			iput(inode);
+		}
+	} else {
+		dentry = d_obtain_alias(inode);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			goto failed_dentry;
+		}
 	}
 	*root_dentry = dentry;
  out:
 	return ret;
+
+ failed_dentry:
+	printk(KERN_ERR "NILFS: get root dentry failed\n");
+	goto out;
 }
 
 static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
@@ -817,6 +812,25 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 	return ret;
 }
 
+static int nilfs_tree_was_touched(struct dentry *root_dentry)
+{
+	return atomic_read(&root_dentry->d_count) > 1;
+}
+
+/**
+ * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
+ * @root_dentry: root dentry of the tree to be shrunk
+ *
+ * This function returns true if the tree was in-use.
+ */
+static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
+{
+	if (have_submounts(root_dentry))
+		return true;
+	shrink_dcache_parent(root_dentry);
+	return nilfs_tree_was_touched(root_dentry);
+}
+
 /**
  * nilfs_fill_super() - initialize a super block instance
  * @sb: super_block
@@ -845,7 +859,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	get_nilfs(nilfs);
 	sbi->s_nilfs = nilfs;
 	sbi->s_super = sb;
-	atomic_set(&sbi->s_count, 1);
 
 	err = init_nilfs(nilfs, sbi, (char *)data);
 	if (err)
@@ -853,7 +866,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
 	spin_lock_init(&sbi->s_inode_lock);
 	INIT_LIST_HEAD(&sbi->s_dirty_files);
-	INIT_LIST_HEAD(&sbi->s_list);
 
 	/*
 	 * Following initialization is overlapped because
@@ -875,20 +887,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	if (err)
 		goto failed_sbi;
 
-	if (nilfs_test_opt(sbi, SNAPSHOT)) {
-		err = nilfs_attach_snapshot(sb, sbi->s_snapshot_cno,
-					    &sb->s_root);
-		if (err)
-			goto failed_sbi;
-
-		goto add_to_supers;
-	}
-
 	cno = nilfs_last_cno(nilfs);
 	err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
 	if (err) {
-		printk(KERN_ERR "NILFS: error loading a checkpoint"
-		       " (checkpoint number=%llu).\n", (unsigned long long)cno);
+		printk(KERN_ERR "NILFS: error loading last checkpoint "
+		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
 		goto failed_sbi;
 	}
 
@@ -910,13 +913,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		up_write(&nilfs->ns_sem);
 	}
 
- add_to_supers:
-	down_write(&nilfs->ns_super_sem);
-	list_add(&sbi->s_list, &nilfs->ns_supers);
-	if (!nilfs_test_opt(sbi, SNAPSHOT))
-		nilfs->ns_current = sbi;
-	up_write(&nilfs->ns_super_sem);
-
 	return 0;
 
  failed_segctor:
@@ -928,7 +924,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
  failed_sbi:
 	put_nilfs(nilfs);
 	sb->s_fs_info = NULL;
-	nilfs_put_sbinfo(sbi);
+	kfree(sbi);
 	return err;
 }
 
@@ -938,13 +934,10 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int was_snapshot, err;
+	int err;
 
-	down_write(&nilfs->ns_super_sem);
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
-	old_opts.snapshot_cno = sbi->s_snapshot_cno;
-	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
 	if (!parse_options(data, sb, 1)) {
 		err = -EINVAL;
@@ -953,11 +946,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
 
 	err = -EINVAL;
-	if (was_snapshot && !(*flags & MS_RDONLY)) {
-		printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
-		       "read/write.\n", sb->s_id);
-		goto restore_opts;
-	}
 
 	if (!nilfs_valid_fs(nilfs)) {
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -1014,14 +1002,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		up_write(&nilfs->ns_sem);
 	}
  out:
-	up_write(&nilfs->ns_super_sem);
 	return 0;
 
  restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
-	sbi->s_snapshot_cno = old_opts.snapshot_cno;
-	up_write(&nilfs->ns_super_sem);
 	return err;
 }
 
@@ -1075,18 +1060,14 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 
 static int nilfs_set_bdev_super(struct super_block *s, void *data)
 {
-	struct nilfs_super_data *sd = data;
-
-	s->s_bdev = sd->bdev;
+	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
 	return 0;
 }
 
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
-	struct nilfs_super_data *sd = data;
-
-	return sd->sbi && s->s_fs_info == (void *)sd->sbi;
+	return (void *)s->s_bdev == data;
 }
 
 static int
@@ -1097,7 +1078,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	fmode_t mode = FMODE_READ;
 	struct the_nilfs *nilfs;
-	int err, need_to_close = 1;
+	struct dentry *root_dentry;
+	int err, s_new = false;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
@@ -1106,12 +1088,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(sd.bdev))
 		return PTR_ERR(sd.bdev);
 
-	/*
-	 * To get mount instance using sget() vfs-routine, NILFS needs
-	 * much more information than normal filesystems to identify mount
-	 * instance.  For snapshot mounts, not only a mount type (ro-mount
-	 * or rw-mount) but also a checkpoint number is required.
-	 */
 	sd.cno = 0;
 	sd.flags = flags;
 	if (nilfs_identify((char *)data, &sd)) {
@@ -1127,38 +1103,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 	mutex_lock(&nilfs->ns_mount_mutex);
 
-	if (!sd.cno) {
-		/*
-		 * Check if an exclusive mount exists or not.
-		 * Snapshot mounts coexist with a current mount
-		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
-		 * ro-mount are mutually exclusive.
-		 */
-		down_read(&nilfs->ns_super_sem);
-		if (nilfs->ns_current &&
-		    ((nilfs->ns_current->s_super->s_flags ^ flags)
-		     & MS_RDONLY)) {
-			up_read(&nilfs->ns_super_sem);
-			err = -EBUSY;
-			goto failed_unlock;
-		}
-		up_read(&nilfs->ns_super_sem);
-	}
-
-	/*
-	 * Find existing nilfs_sb_info struct
-	 */
-	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
-
-	/*
-	 * Get super block instance holding the nilfs_sb_info struct.
-	 * A new instance is allocated if no existing mount is present or
-	 * existing instance has been unmounted.
-	 */
-	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-	if (sd.sbi)
-		nilfs_put_sbinfo(sd.sbi);
-
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
 		goto failed_unlock;
@@ -1167,6 +1112,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		char b[BDEVNAME_SIZE];
 
+		s_new = true;
+
 		/* New superblock instance created */
 		s->s_flags = flags;
 		s->s_mode = mode;
@@ -1179,16 +1126,53 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto cancel_new;
 
 		s->s_flags |= MS_ACTIVE;
-		need_to_close = 0;
+	} else if (!sd.cno) {
+		int busy = false;
+
+		if (nilfs_tree_was_touched(s->s_root)) {
+			busy = nilfs_try_to_shrink_tree(s->s_root);
+			if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
+				printk(KERN_ERR "NILFS: the device already "
+				       "has a %s mount.\n",
+				       (s->s_flags & MS_RDONLY) ?
+				       "read-only" : "read/write");
+				err = -EBUSY;
+				goto failed_super;
+			}
+		}
+		if (!busy) {
+			/*
+			 * Try remount to setup mount states if the current
+			 * tree is not mounted and only snapshots use this sb.
+			 */
+			err = nilfs_remount(s, &flags, data);
+			if (err)
+				goto failed_super;
+		}
+	}
+
+	if (sd.cno) {
+		err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
+		if (err) {
+			if (s_new)
+				goto cancel_new;
+			goto failed_super;
+		}
+	} else {
+		root_dentry = dget(s->s_root);
 	}
 
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
-	if (need_to_close)
+	if (!s_new)
 		close_bdev_exclusive(sd.bdev, mode);
-	simple_set_mnt(mnt, s);
+
+	mnt->mnt_sb = s;
+	mnt->mnt_root = root_dentry;
 	return 0;
 
+ failed_super:
+	deactivate_locked_super(s);
  failed_unlock:
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
@@ -1202,7 +1186,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	put_nilfs(nilfs);
 	deactivate_locked_super(s);
 	/*
-	 * deactivate_locked_super() invokes close_bdev_exclusive().
+	 * This deactivate_locked_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 89c78562d0e9..960c28797bb2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -82,11 +82,9 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_count, 1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
-	init_rwsem(&nilfs->ns_super_sem);
 	mutex_init(&nilfs->ns_mount_mutex);
 	init_rwsem(&nilfs->ns_writer_sem);
 	INIT_LIST_HEAD(&nilfs->ns_list);
-	INIT_LIST_HEAD(&nilfs->ns_supers);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_cptree = RB_ROOT;
@@ -307,15 +305,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	int valid_fs = nilfs_valid_fs(nilfs);
 	int err;
 
-	if (nilfs_loaded(nilfs)) {
-		if (valid_fs ||
-		    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
-			return 0;
-		printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
-		       "recovery state.\n");
-		return -EINVAL;
-	}
-
 	if (!valid_fs) {
 		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
 		if (s_flags & MS_RDONLY) {
@@ -632,12 +621,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
  *
  * init_nilfs() performs common initialization per block device (e.g.
  * reading the super block, getting disk layout information, initializing
- * shared fields in the_nilfs). It takes on some portion of the jobs
- * typically done by a fill_super() routine. This division arises from
- * the nature that multiple NILFS instances may be simultaneously
- * mounted on a device.
- * For multiple mounts on the same device, only the first mount
- * invokes these tasks.
+ * shared fields in the_nilfs).
  *
  * Return Value: On success, 0 is returned. On error, a negative error
  * code is returned.
@@ -651,27 +635,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	int err;
 
 	down_write(&nilfs->ns_sem);
-	if (nilfs_init(nilfs)) {
-		/* Load values from existing the_nilfs */
-		sbp = nilfs->ns_sbp[0];
-		err = nilfs_store_magic_and_option(sb, sbp, data);
-		if (err)
-			goto out;
-
-		err = nilfs_check_feature_compatibility(sb, sbp);
-		if (err)
-			goto out;
-
-		blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
-		if (sb->s_blocksize != blocksize &&
-		    !sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
-			       blocksize);
-			err = -EINVAL;
-		}
-		sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
-		goto out;
-	}
 
 	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
 	if (!blocksize) {
@@ -901,56 +864,6 @@ void nilfs_put_root(struct nilfs_root *root)
 	}
 }
 
-/**
- * nilfs_find_sbinfo - find existing nilfs_sb_info structure
- * @nilfs: nilfs object
- * @rw_mount: mount type (non-zero value for read/write mount)
- * @cno: checkpoint number (zero for read-only mount)
- *
- * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
- * @rw_mount and @cno (in case of snapshots) matched.  If no instance
- * was found, NULL is returned.  Although the super block instance can
- * be unmounted after this function returns, the nilfs_sb_info struct
- * is kept on memory until nilfs_put_sbinfo() is called.
- */
-struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
-					int rw_mount, __u64 cno)
-{
-	struct nilfs_sb_info *sbi;
-
-	down_read(&nilfs->ns_super_sem);
-	/*
-	 * The SNAPSHOT flag and sb->s_flags are supposed to be
-	 * protected with nilfs->ns_super_sem.
-	 */
-	sbi = nilfs->ns_current;
-	if (rw_mount) {
-		if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
-			goto found; /* read/write mount */
-		else
-			goto out;
-	} else if (cno == 0) {
-		if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
-			goto found; /* read-only mount */
-		else
-			goto out;
-	}
-
-	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
-		if (nilfs_test_opt(sbi, SNAPSHOT) &&
-		    sbi->s_snapshot_cno == cno)
-			goto found; /* snapshot mount */
-	}
- out:
-	up_read(&nilfs->ns_super_sem);
-	return NULL;
-
- found:
-	atomic_inc(&sbi->s_count);
-	up_read(&nilfs->ns_super_sem);
-	return sbi;
-}
-
 int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 				int snapshot_mount)
 {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 0afede68a9e1..7b43693a69c5 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -52,16 +52,13 @@ enum {
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
- * @ns_super_sem: semaphore for global operations across super block instances
  * @ns_mount_mutex: mutex protecting mount process of nilfs
  * @ns_writer_sem: semaphore protecting ns_writer attach/detach
- * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
  * @ns_sbwtime: previous write time of super block
  * @ns_sbwcount: write count of super block
  * @ns_sbsize: size of valid data in super block
- * @ns_supers: list of nilfs super block structs
  * @ns_seg_seq: segment sequence counter
  * @ns_segnum: index number of the latest full segment.
  * @ns_nextnum: index number of the full segment index to be used next
@@ -104,16 +101,9 @@ struct the_nilfs {
 	struct backing_dev_info *ns_bdi;
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
-	struct rw_semaphore	ns_super_sem;
 	struct mutex		ns_mount_mutex;
 	struct rw_semaphore	ns_writer_sem;
 
-	/*
-	 * components protected by ns_super_sem
-	 */
-	struct nilfs_sb_info   *ns_current;
-	struct list_head	ns_supers;
-
 	/*
 	 * used for
 	 * - loading the latest checkpoint exclusively.
@@ -294,12 +284,6 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	up_write(&nilfs->ns_writer_sem);
 }
 
-static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
-{
-	if (atomic_dec_and_test(&sbi->s_count))
-		kfree(sbi);
-}
-
 static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
 {
 	unsigned valid_fs;
-- 
cgit v1.2.3


From 348fe8da13621b3d14ab2d156e74551611997017 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 9 Sep 2010 02:07:56 +0900
Subject: nilfs2: simplify life cycle management of nilfs object

This stops pre-allocating nilfs object in nilfs_get_sb routine, and
stops managing its life cycle by reference counting.

nilfs_find_or_create_nilfs() function, nilfs->ns_mount_mutex,
nilfs_objects list, and the reference counter will be removed through
the simplification.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c     |  4 +--
 fs/nilfs2/super.c     | 67 +++++++++++++++------------------------------
 fs/nilfs2/the_nilfs.c | 75 ++++-----------------------------------------------
 fs/nilfs2/the_nilfs.h | 16 ++---------
 4 files changed, 31 insertions(+), 131 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2ee6843c2e87..338858fc907c 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -117,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
 		goto out;
 
-	mutex_lock(&nilfs->ns_mount_mutex);
+	down_read(&inode->i_sb->s_umount);
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
@@ -127,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 	else
 		nilfs_transaction_commit(inode->i_sb); /* never fails */
 
-	mutex_unlock(&nilfs->ns_mount_mutex);
+	up_read(&inode->i_sb->s_umount);
 out:
 	mnt_drop_write(filp->f_path.mnt);
 	return ret;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e58e7c629b5..5893cb27c909 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -356,7 +356,7 @@ static void nilfs_put_super(struct super_block *sb)
 		up_write(&nilfs->ns_sem);
 	}
 
-	put_nilfs(sbi->s_nilfs);
+	destroy_nilfs(nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -836,15 +836,14 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
  * @sb: super_block
  * @data: mount options
  * @silent: silent mode flag
- * @nilfs: the_nilfs struct
  *
  * This function is called exclusively by nilfs->ns_mount_mutex.
  * So, the recovery process is protected from other simultaneous mounts.
  */
 static int
-nilfs_fill_super(struct super_block *sb, void *data, int silent,
-		 struct the_nilfs *nilfs)
+nilfs_fill_super(struct super_block *sb, void *data, int silent)
 {
+	struct the_nilfs *nilfs;
 	struct nilfs_sb_info *sbi;
 	struct nilfs_root *fsroot;
 	__u64 cno;
@@ -855,14 +854,18 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->s_super = sb;
 
-	get_nilfs(nilfs);
+	nilfs = alloc_nilfs(sb->s_bdev);
+	if (!nilfs) {
+		err = -ENOMEM;
+		goto failed_sbi;
+	}
 	sbi->s_nilfs = nilfs;
-	sbi->s_super = sb;
 
 	err = init_nilfs(nilfs, sbi, (char *)data);
 	if (err)
-		goto failed_sbi;
+		goto failed_nilfs;
 
 	spin_lock_init(&sbi->s_inode_lock);
 	INIT_LIST_HEAD(&sbi->s_dirty_files);
@@ -885,14 +888,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
-		goto failed_sbi;
+		goto failed_nilfs;
 
 	cno = nilfs_last_cno(nilfs);
 	err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
 	if (err) {
 		printk(KERN_ERR "NILFS: error loading last checkpoint "
 		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
-		goto failed_sbi;
+		goto failed_nilfs;
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -921,8 +924,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
  failed_checkpoint:
 	nilfs_put_root(fsroot);
 
+ failed_nilfs:
+	destroy_nilfs(nilfs);
+
  failed_sbi:
-	put_nilfs(nilfs);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 	return err;
@@ -1077,7 +1082,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	struct nilfs_super_data sd;
 	struct super_block *s;
 	fmode_t mode = FMODE_READ;
-	struct the_nilfs *nilfs;
 	struct dentry *root_dentry;
 	int err, s_new = false;
 
@@ -1095,18 +1099,10 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	nilfs = find_or_create_nilfs(sd.bdev);
-	if (!nilfs) {
-		err = -ENOMEM;
-		goto failed;
-	}
-
-	mutex_lock(&nilfs->ns_mount_mutex);
-
 	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
-		goto failed_unlock;
+		goto failed;
 	}
 
 	if (!s->s_root) {
@@ -1120,10 +1116,9 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(sd.bdev));
 
-		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
-				       nilfs);
+		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (err)
-			goto cancel_new;
+			goto failed_super;
 
 		s->s_flags |= MS_ACTIVE;
 	} else if (!sd.cno) {
@@ -1153,17 +1148,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 	if (sd.cno) {
 		err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
-		if (err) {
-			if (s_new)
-				goto cancel_new;
+		if (err)
 			goto failed_super;
-		}
 	} else {
 		root_dentry = dget(s->s_root);
 	}
 
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
 	if (!s_new)
 		close_bdev_exclusive(sd.bdev, mode);
 
@@ -1173,23 +1163,10 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
  failed_super:
 	deactivate_locked_super(s);
- failed_unlock:
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
- failed:
-	close_bdev_exclusive(sd.bdev, mode);
-	return err;
 
- cancel_new:
-	/* Abandoning the newly allocated superblock */
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
-	deactivate_locked_super(s);
-	/*
-	 * This deactivate_locked_super() invokes close_bdev_exclusive().
-	 * We must finish all post-cleaning before this call;
-	 * put_nilfs() needs the block device.
-	 */
+ failed:
+	if (!s_new)
+		close_bdev_exclusive(sd.bdev, mode);
 	return err;
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 960c28797bb2..6eeb4f072f83 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
 #include "segbuf.h"
 
 
-static LIST_HEAD(nilfs_objects);
-static DEFINE_SPINLOCK(nilfs_lock);
-
 static int nilfs_valid_sb(struct nilfs_super_block *sbp);
 
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 }
 
 /**
- * alloc_nilfs - allocate the_nilfs structure
+ * alloc_nilfs - allocate a nilfs object
  * @bdev: block device to which the_nilfs is related
  *
- * alloc_nilfs() allocates memory for the_nilfs and
- * initializes its reference count and locks.
- *
  * Return Value: On success, pointer to the_nilfs is returned.
  * On error, NULL is returned.
  */
-static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
 	struct the_nilfs *nilfs;
 
@@ -79,12 +73,9 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 		return NULL;
 
 	nilfs->ns_bdev = bdev;
-	atomic_set(&nilfs->ns_count, 1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
-	mutex_init(&nilfs->ns_mount_mutex);
 	init_rwsem(&nilfs->ns_writer_sem);
-	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_cptree = RB_ROOT;
@@ -95,67 +86,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 }
 
 /**
- * find_or_create_nilfs - find or create nilfs object
- * @bdev: block device to which the_nilfs is related
- *
- * find_nilfs() looks up an existent nilfs object created on the
- * device and gets the reference count of the object.  If no nilfs object
- * is found on the device, a new nilfs object is allocated.
- *
- * Return Value: On success, pointer to the nilfs object is returned.
- * On error, NULL is returned.
- */
-struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
-{
-	struct the_nilfs *nilfs, *new = NULL;
-
- retry:
-	spin_lock(&nilfs_lock);
-	list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
-		if (nilfs->ns_bdev == bdev) {
-			get_nilfs(nilfs);
-			spin_unlock(&nilfs_lock);
-			if (new)
-				put_nilfs(new);
-			return nilfs; /* existing object */
-		}
-	}
-	if (new) {
-		list_add_tail(&new->ns_list, &nilfs_objects);
-		spin_unlock(&nilfs_lock);
-		return new; /* new object */
-	}
-	spin_unlock(&nilfs_lock);
-
-	new = alloc_nilfs(bdev);
-	if (new)
-		goto retry;
-	return NULL; /* insufficient memory */
-}
-
-/**
- * put_nilfs - release a reference to the_nilfs
- * @nilfs: the_nilfs structure to be released
- *
- * put_nilfs() decrements a reference counter of the_nilfs.
- * If the reference count reaches zero, the_nilfs is freed.
+ * destroy_nilfs - destroy nilfs object
+ * @nilfs: nilfs object to be released
  */
-void put_nilfs(struct the_nilfs *nilfs)
+void destroy_nilfs(struct the_nilfs *nilfs)
 {
-	spin_lock(&nilfs_lock);
-	if (!atomic_dec_and_test(&nilfs->ns_count)) {
-		spin_unlock(&nilfs_lock);
-		return;
-	}
-	list_del_init(&nilfs->ns_list);
-	spin_unlock(&nilfs_lock);
-
-	/*
-	 * Increment of ns_count never occurs below because the caller
-	 * of get_nilfs() holds at least one reference to the_nilfs.
-	 * Thus its exclusion control is not required here.
-	 */
-
 	might_sleep();
 	if (nilfs_loaded(nilfs)) {
 		nilfs_mdt_destroy(nilfs->ns_sufile);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 7b43693a69c5..1908dc7bbd8f 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -46,13 +46,10 @@ enum {
 /**
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
- * @ns_count: reference count
- * @ns_list: list head for nilfs_list
  * @ns_bdev: block device
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
- * @ns_mount_mutex: mutex protecting mount process of nilfs
  * @ns_writer_sem: semaphore protecting ns_writer attach/detach
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
@@ -94,14 +91,11 @@ enum {
  */
 struct the_nilfs {
 	unsigned long		ns_flags;
-	atomic_t		ns_count;
-	struct list_head	ns_list;
 
 	struct block_device    *ns_bdev;
 	struct backing_dev_info *ns_bdi;
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
-	struct mutex		ns_mount_mutex;
 	struct rw_semaphore	ns_writer_sem;
 
 	/*
@@ -239,8 +233,8 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
 }
 
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *find_or_create_nilfs(struct block_device *);
-void put_nilfs(struct the_nilfs *);
+struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+void destroy_nilfs(struct the_nilfs *nilfs);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
@@ -256,12 +250,6 @@ void nilfs_fall_back_super_block(struct the_nilfs *);
 void nilfs_swap_super_block(struct the_nilfs *);
 
 
-static inline void get_nilfs(struct the_nilfs *nilfs)
-{
-	/* Caller must have at least one reference of the_nilfs. */
-	atomic_inc(&nilfs->ns_count);
-}
-
 static inline void nilfs_get_root(struct nilfs_root *root)
 {
 	atomic_inc(&root->count);
-- 
cgit v1.2.3


From 518d1a6a1d6610a014875a2fe3252e89b6260d2f Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 23:40:54 +0900
Subject: nilfs2: allow nilfs_clear_inode to clear metadata file inodes

Allows clear inode function (nilfs_clear_inode) to handle metadata
files that uses bitmap-based object alloctor.  DAT and ifile
correspond to this.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 3efef0ecfa24..5485dd12da64 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -674,6 +674,7 @@ void nilfs_truncate(struct inode *inode)
 static void nilfs_clear_inode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
 	/*
 	 * Free resources allocated in nilfs_read_inode(), here.
@@ -682,6 +683,9 @@ static void nilfs_clear_inode(struct inode *inode)
 	brelse(ii->i_bh);
 	ii->i_bh = NULL;
 
+	if (mdi && mdi->mi_palloc_cache)
+		nilfs_palloc_destroy_cache(inode);
+
 	if (test_bit(NILFS_I_BMAP, &ii->i_state))
 		nilfs_bmap_clear(ii->i_bmap);
 
-- 
cgit v1.2.3


From adbb39b5485b72dca963a2bc9b1b22bfc19d4967 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 5 Sep 2010 10:14:43 +0900
Subject: nilfs2: do not allocate nilfs_mdt_info structure to gc-inodes

GC-inode now doesn't need the nilfs_mdt_info structure and there is no
reason that it is a sort of metadata files.

This stops the allocation and makes them not dependent on metadata
file routines.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/gcinode.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 34f8f84a22e3..33ad25ddd5c4 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -168,7 +168,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 		}
 		nilfs_btnode_mark_dirty(bh);
 	} else {
-		nilfs_mdt_mark_buffer_dirty(bh);
+		nilfs_mark_buffer_dirty(bh);
 	}
 	return 0;
 }
@@ -177,24 +177,24 @@ int nilfs_init_gcinode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
-	int ret;
 
-	ret = nilfs_mdt_init(inode, nilfs, GFP_NOFS, 0);
-	if (!ret) {
-		inode->i_mapping->a_ops = &def_gcinode_aops;
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	inode->i_mapping->a_ops = &def_gcinode_aops;
+	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
-		ii->i_flags = 0;
-		nilfs_bmap_init_gc(ii->i_bmap);
+	ii->i_flags = 0;
+	nilfs_bmap_init_gc(ii->i_bmap);
 
-		/*
-		 * Add the inode to GC inode list. Garbage Collection
-		 * is serialized and no two processes manipulate the
-		 * list simultaneously.
-		 */
-		igrab(inode);
-		list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-	}
-	return ret;
+	/*
+	 * Add the inode to GC inode list. Garbage Collection
+	 * is serialized and no two processes manipulate the
+	 * list simultaneously.
+	 */
+	igrab(inode);
+	list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
+
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From a8070dd365dd995f6139a2fc3aeee10159bdcc45 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 30 Aug 2010 23:42:18 +0900
Subject: nilfs2: add routines to save and restore bmap state

This adds routines to save and restore the state of bmap structure.
The bmap state is stored in a given nilfs_bmap_store object.

These routines will be used to roll back the state of dat inode
without using gcdat inode.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c | 18 ++++++++++++++++++
 fs/nilfs2/bmap.h |  8 ++++++++
 2 files changed, 26 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..00244402d59e 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -548,3 +548,21 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 }
+
+void nilfs_bmap_save(const struct nilfs_bmap *bmap,
+		     struct nilfs_bmap_store *store)
+{
+	memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
+	store->last_allocated_key = bmap->b_last_allocated_key;
+	store->last_allocated_ptr = bmap->b_last_allocated_ptr;
+	store->state = bmap->b_state;
+}
+
+void nilfs_bmap_restore(struct nilfs_bmap *bmap,
+			const struct nilfs_bmap_store *store)
+{
+	memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
+	bmap->b_last_allocated_key = store->last_allocated_key;
+	bmap->b_last_allocated_ptr = store->last_allocated_ptr;
+	bmap->b_state = store->state;
+}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..5f3339e3eac9 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
+struct nilfs_bmap_store {
+	__le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	__u64 last_allocated_key;
+	__u64 last_allocated_ptr;
+	int state;
+};
 
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -156,6 +162,8 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *);
 void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
+void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
+void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
 
 static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
 				    __u64 *ptr)
-- 
cgit v1.2.3


From ebdfed4dc59d177cf26013a0c9b8ee9652e9a140 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 6 Sep 2010 12:05:43 +0900
Subject: nilfs2: add routines to roll back state of DAT file

This adds optional function to metadata files which makes a copy of
bmap, page caches, and b-tree node cache, and rolls back to the copy
as needed.

This enhancement is intended to displace gcdat inode that provides a
similar function in a different way.

In this patch, nilfs_shadow_map structure is added to store a copy of
the foregoing states.  nilfs_mdt_setup_shadow_map relates this
structure to a metadata file.  And, nilfs_mdt_save_to_shadow_map() and
nilfs_mdt_restore_from_shadow_map() provides save and restore
functions respectively.  Finally, nilfs_mdt_clear_shadow_map() clears
states of nilfs_shadow_map.

The copy of b-tree node cache and page cache is made by duplicating
only dirty pages into corresponding caches in nilfs_shadow_map.  Their
restoration is done by clearing dirty pages from original caches and
by copying dirty pages back from nilfs_shadow_map.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btnode.c |  17 ++-------
 fs/nilfs2/mdt.c    | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/nilfs2/mdt.h    |  14 ++++++++
 fs/nilfs2/page.c   |  25 +++++++++++++
 fs/nilfs2/page.h   |   4 +++
 5 files changed, 145 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..5115814cb745 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
 
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
-	memset(btnc, 0, sizeof(*btnc));
-	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
-	spin_lock_init(&btnc->tree_lock);
-	INIT_LIST_HEAD(&btnc->private_list);
-	spin_lock_init(&btnc->private_lock);
-
-	spin_lock_init(&btnc->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
-	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
+	nilfs_mapping_init_once(btnc);
 }
 
 static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
 void nilfs_btnode_cache_init(struct address_space *btnc,
 			     struct backing_dev_info *bdi)
 {
-	btnc->host = NULL;  /* can safely set to host inode ? */
-	btnc->flags = 0;
-	mapping_set_gfp_mask(btnc, GFP_NOFS);
-	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = bdi;
-	btnc->a_ops = &def_btnode_aops;
+	nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 73e5da3b097e..0066468609da 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -398,16 +398,22 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
 static int
 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *inode = container_of(page->mapping,
-					   struct inode, i_data);
-	struct super_block *sb = inode->i_sb;
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
+	struct inode *inode;
+	struct super_block *sb;
+	struct the_nilfs *nilfs;
 	struct nilfs_sb_info *writer = NULL;
 	int err = 0;
 
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 
+	inode = page->mapping->host;
+	if (!inode)
+		return 0;
+
+	sb = inode->i_sb;
+	nilfs = NILFS_MDT(inode)->mi_nilfs;
+
 	if (page->mapping->assoc_mapping)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
@@ -567,6 +573,96 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
 		&NILFS_I(orig)->i_btnode_cache;
 }
 
+static const struct address_space_operations shadow_map_aops = {
+	.sync_page		= block_sync_page,
+};
+
+/**
+ * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
+ * @inode: inode of the metadata file
+ * @shadow: shadow mapping
+ */
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct backing_dev_info *bdi = NILFS_I_NILFS(inode)->ns_bdi;
+
+	INIT_LIST_HEAD(&shadow->frozen_buffers);
+	nilfs_mapping_init_once(&shadow->frozen_data);
+	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+	mi->mi_shadow = shadow;
+	return 0;
+}
+
+/**
+ * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
+ * @inode: inode of the metadata file
+ */
+int nilfs_mdt_save_to_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	int ret;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+	if (ret)
+		goto out;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
+				     &ii->i_btnode_cache);
+	if (ret)
+		goto out;
+
+	nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
+ out:
+	return ret;
+}
+
+/**
+ * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
+
+	if (mi->mi_palloc_cache)
+		nilfs_palloc_clear_cache(inode);
+
+	nilfs_clear_dirty_pages(inode->i_mapping);
+	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+
+	nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+
+	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
+
+	up_write(&mi->mi_sem);
+}
+
+/**
+ * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
+	truncate_inode_pages(&shadow->frozen_data, 0);
+	truncate_inode_pages(&shadow->frozen_btnodes, 0);
+	up_write(&mi->mi_sem);
+}
+
 static void nilfs_mdt_clear(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index f44560224bd1..e7f0d158c527 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,6 +28,13 @@
 #include "nilfs.h"
 #include "page.h"
 
+struct nilfs_shadow_map {
+	struct nilfs_bmap_store bmap_store;
+	struct address_space frozen_data;
+	struct address_space frozen_btnodes;
+	struct list_head frozen_buffers;
+};
+
 /**
  * struct nilfs_mdt_info - on-memory private data of meta data files
  * @mi_nilfs: back pointer to the_nilfs struct
@@ -37,6 +44,7 @@
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
  * @mi_palloc_cache: persistent object allocator cache
+ * @mi_shadow: shadow of bmap and page caches
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
@@ -48,6 +56,7 @@ struct nilfs_mdt_info {
 	unsigned		mi_first_entry_offset;
 	unsigned long		mi_entries_per_block;
 	struct nilfs_palloc_cache *mi_palloc_cache;
+	struct nilfs_shadow_map *mi_shadow;
 	unsigned long		mi_blocks_per_group;
 	unsigned long		mi_blocks_per_desc_block;
 };
@@ -86,6 +95,11 @@ void nilfs_mdt_destroy(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow);
+int nilfs_mdt_save_to_shadow_map(struct inode *inode);
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
+void nilfs_mdt_clear_shadow_map(struct inode *inode);
 
 #define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
 
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..6384ac14c0c8 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -513,6 +513,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	}
 	return nc;
 }
+ 
+void nilfs_mapping_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops)
+{
+	mapping->host = NULL;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = bdi;
+	mapping->a_ops = aops;
+}
 
 /*
  * NILFS2 needs clear_page_dirty() in the following two cases:
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..6ec4f498fc2b 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -59,6 +59,10 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_mapping_init_once(struct address_space *mapping);
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 
 #define NILFS_PAGE_BUG(page, m, a...) \
-- 
cgit v1.2.3


From b1f6a4f294088b3fcf9ae67915ca550a1ded2819 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Tue, 31 Aug 2010 11:40:34 +0900
Subject: nilfs2: add routines to redirect access to buffers of DAT file

During garbage collection (GC), DAT file, which converts virtual block
number to real block number, may return disk block number that is not
yet written to the device.

To avoid access to unwritten blocks, the current implementation stores
changes to the caches of GCDAT during GC and atomically commit the
changes into the DAT file after they are written to the device.

This patch, instead, adds a function that makes a copy of specified
buffer and stores it in nilfs_shadow_map, and a function to get the
backup copy as needed (nilfs_mdt_freeze_buffer and
nilfs_mdt_get_frozen_buffer respectively).

Before DAT changes block number in an entry block, it makes a copy and
redirect access to the buffer so that address conversion function
(i.e. nilfs_dat_translate) refers to the old address saved in the
copy.

This patch gives requisites for such redirection.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c     | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/mdt.h     |  3 +++
 fs/nilfs2/page.c    |  2 ++
 fs/nilfs2/page.h    |  2 ++
 fs/nilfs2/segment.c |  1 +
 5 files changed, 75 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 0066468609da..532f85acf273 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -622,6 +622,72 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode)
 	return ret;
 }
 
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen;
+	struct page *page;
+	int blkbits = inode->i_blkbits;
+	int ret = -ENOMEM;
+
+	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+	if (!page)
+		return ret;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, 0);
+
+	bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+	if (bh_frozen) {
+		if (!buffer_uptodate(bh_frozen))
+			nilfs_copy_buffer(bh_frozen, bh);
+		if (list_empty(&bh_frozen->b_assoc_buffers)) {
+			list_add_tail(&bh_frozen->b_assoc_buffers,
+				      &shadow->frozen_buffers);
+			set_buffer_nilfs_redirected(bh);
+		} else {
+			brelse(bh_frozen); /* already frozen */
+		}
+		ret = 0;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+struct buffer_head *
+nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen = NULL;
+	struct page *page;
+	int n;
+
+	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+	if (page) {
+		if (page_has_buffers(page)) {
+			n = bh_offset(bh) >> inode->i_blkbits;
+			bh_frozen = nilfs_page_get_nth_block(page, n);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return bh_frozen;
+}
+
+static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
+{
+	struct list_head *head = &shadow->frozen_buffers;
+	struct buffer_head *bh;
+
+	while (!list_empty(head)) {
+		bh = list_first_entry(head, struct buffer_head,
+				      b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh); /* drop ref-count to make it releasable */
+	}
+}
+
 /**
  * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
  * @inode: inode of the metadata file
@@ -658,6 +724,7 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode)
 	struct nilfs_shadow_map *shadow = mi->mi_shadow;
 
 	down_write(&mi->mi_sem);
+	nilfs_release_frozen_buffers(shadow);
 	truncate_inode_pages(&shadow->frozen_data, 0);
 	truncate_inode_pages(&shadow->frozen_btnodes, 0);
 	up_write(&mi->mi_sem);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e7f0d158c527..e60bbfe899f1 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -100,6 +100,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 int nilfs_mdt_save_to_shadow_map(struct inode *inode);
 void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
 void nilfs_mdt_clear_shadow_map(struct inode *inode);
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
+struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
+						struct buffer_head *bh);
 
 #define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
 
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 6384ac14c0c8..7083344ac881 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -131,6 +131,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
 	clear_buffer_nilfs_checked(bh);
+	clear_buffer_nilfs_redirected(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -483,6 +484,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 				clear_buffer_dirty(bh);
 				clear_buffer_nilfs_volatile(bh);
 				clear_buffer_nilfs_checked(bh);
+				clear_buffer_nilfs_redirected(bh);
 				clear_buffer_uptodate(bh);
 				clear_buffer_mapped(bh);
 				unlock_buffer(bh);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 6ec4f498fc2b..fb9e8a8a2038 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
 	BH_NILFS_Checked,
+	BH_NILFS_Redirected,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
 BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b75306d642c2..91dc0668ec83 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1908,6 +1908,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_redirected(bh);
 			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
-- 
cgit v1.2.3


From c1c1d7092072093ad960db2f6c08f06705c57fa4 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 29 Aug 2010 12:44:56 +0900
Subject: nilfs2: get rid of GCDAT inode

This applies prepared rollback function and redirect function of
metadata file to DAT file, and eliminates GCDAT inode.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/Makefile    |  2 +-
 fs/nilfs2/bmap.c      | 16 ----------
 fs/nilfs2/bmap.h      |  2 --
 fs/nilfs2/dat.c       | 30 +++++++++++++++++-
 fs/nilfs2/gcdat.c     | 87 ---------------------------------------------------
 fs/nilfs2/mdt.c       |  9 ------
 fs/nilfs2/mdt.h       |  1 -
 fs/nilfs2/nilfs.h     |  8 +----
 fs/nilfs2/page.c      | 28 ++---------------
 fs/nilfs2/segment.c   | 14 ++++-----
 fs/nilfs2/the_nilfs.c | 13 +-------
 fs/nilfs2/the_nilfs.h |  2 --
 12 files changed, 41 insertions(+), 171 deletions(-)
 delete mode 100644 fs/nilfs2/gcdat.c

(limited to 'fs')

diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
 nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
 	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
 	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
-	ifile.o alloc.o gcinode.o ioctl.o gcdat.o
+	ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 00244402d59e..8b782b062baa 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,22 +533,6 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	nilfs_btree_init_gc(bmap);
 }
 
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
-{
-	memcpy(gcbmap, bmap, sizeof(*bmap));
-	init_rwsem(&gcbmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
-}
-
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
-{
-	memcpy(bmap, gcbmap, sizeof(*bmap));
-	init_rwsem(&bmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-}
-
 void nilfs_bmap_save(const struct nilfs_bmap *bmap,
 		     struct nilfs_bmap_store *store)
 {
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 5f3339e3eac9..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -159,8 +159,6 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
 int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
 
 void nilfs_bmap_init_gc(struct nilfs_bmap *);
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
 void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
 void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..7091c4e0f042 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
 struct nilfs_dat_info {
 	struct nilfs_mdt_info mi;
 	struct nilfs_palloc_cache palloc_cache;
+	struct nilfs_shadow_map shadow;
 };
 
 static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -327,6 +328,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
 	if (ret < 0)
 		return ret;
+
+	/*
+	 * The given disk block number (blocknr) is not yet written to
+	 * the device at this point.
+	 *
+	 * To prevent nilfs_dat_translate() from returning the
+	 * uncommited block number, this makes a copy of the entry
+	 * buffer and redirects nilfs_dat_translate() to the copy.
+	 */
+	if (!buffer_nilfs_redirected(entry_bh)) {
+		ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
+		if (ret) {
+			brelse(entry_bh);
+			return ret;
+		}
+	}
+
 	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +389,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
  */
 int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 {
-	struct buffer_head *entry_bh;
+	struct buffer_head *entry_bh, *bh;
 	struct nilfs_dat_entry *entry;
 	sector_t blocknr;
 	void *kaddr;
@@ -381,6 +399,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	if (ret < 0)
 		return ret;
 
+	if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
+		bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
+		if (bh) {
+			WARN_ON(!buffer_uptodate(bh));
+			brelse(entry_bh);
+			entry_bh = bh;
+		}
+	}
+
 	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	blocknr = le64_to_cpu(entry->de_blocknr);
@@ -468,6 +495,7 @@ struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
 		di = NILFS_DAT_I(dat);
 		lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
 		nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+		nilfs_mdt_setup_shadow_map(dat, &di->shadow);
 	}
 	return dat;
 }
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * gcdat.c - NILFS shadow DAT inode for GC
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- *            and Ryusuke Konishi <ryusuke@osrg.net>.
- *
- */
-
-#include <linux/buffer_head.h>
-#include "nilfs.h"
-#include "page.h"
-#include "mdt.h"
-
-int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-	int err;
-
-	gcdat->i_state = 0;
-	gcdat->i_blocks = dat->i_blocks;
-	gii->i_flags = dii->i_flags;
-	gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
-	gii->i_cno = 0;
-	nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
-	err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
-	if (unlikely(err))
-		return err;
-
-	return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
-				      &dii->i_btnode_cache);
-}
-
-void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-	struct address_space *mapping = dat->i_mapping;
-	struct address_space *gmapping = gcdat->i_mapping;
-
-	down_write(&NILFS_MDT(dat)->mi_sem);
-	dat->i_blocks = gcdat->i_blocks;
-	dii->i_flags = gii->i_flags;
-	dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
-
-	nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
-
-	nilfs_palloc_clear_cache(dat);
-	nilfs_palloc_clear_cache(gcdat);
-	nilfs_clear_dirty_pages(mapping);
-	nilfs_copy_back_pages(mapping, gmapping);
-	/* note: mdt dirty flags should be cleared by segctor. */
-
-	nilfs_clear_dirty_pages(&dii->i_btnode_cache);
-	nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
-
-	up_write(&NILFS_MDT(dat)->mi_sem);
-}
-
-void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *gii = NILFS_I(gcdat);
-
-	gcdat->i_state = I_FREEING | I_CLEAR;
-	gii->i_flags = 0;
-
-	nilfs_palloc_clear_cache(gcdat);
-	truncate_inode_pages(gcdat->i_mapping, 0);
-	truncate_inode_pages(&gii->i_btnode_cache, 0);
-}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 532f85acf273..3bbd340a5136 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -414,8 +414,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 	sb = inode->i_sb;
 	nilfs = NILFS_MDT(inode)->mi_nilfs;
 
-	if (page->mapping->assoc_mapping)
-		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
 		down_read(&nilfs->ns_writer_sem);
 		writer = nilfs->ns_writer;
@@ -566,13 +564,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
-void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
-{
-	shadow->i_mapping->assoc_mapping = orig->i_mapping;
-	NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
-		&NILFS_I(orig)->i_btnode_cache;
-}
-
 static const struct address_space_operations shadow_map_aops = {
 	.sync_page		= block_sync_page,
 };
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e60bbfe899f1..1e0901c3fd6b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -93,7 +93,6 @@ struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
 				   ino_t);
 void nilfs_mdt_destroy(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
-void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
 int nilfs_mdt_setup_shadow_map(struct inode *inode,
 			       struct nilfs_shadow_map *shadow);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index cf5507a2a178..e9f457951e32 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -101,7 +101,6 @@ enum {
 	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
 	NILFS_I_BMAP,			/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,		/* inode for GC, on memory only */
-	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
 };
 
 /*
@@ -193,7 +192,7 @@ static inline int nilfs_doing_construction(void)
 
 static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 {
-	return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+	return nilfs->ns_dat;
 }
 
 /*
@@ -294,11 +293,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
 int nilfs_init_gcinode(struct inode *inode);
 void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
-/* gcdat.c */
-int nilfs_init_gcdat_inode(struct the_nilfs *);
-void nilfs_commit_gcdat_inode(struct the_nilfs *);
-void nilfs_clear_gcdat_inode(struct the_nilfs *);
-
 /*
  * Inodes and files operations
  */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 7083344ac881..a6c3c2e817f8 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 {
 	int blkbits = inode->i_blkbits;
 	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
-	struct page *page, *opage;
-	struct buffer_head *bh, *obh;
+	struct page *page;
+	struct buffer_head *bh;
 
 	page = grab_cache_page(mapping, index);
 	if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 		page_cache_release(page);
 		return NULL;
 	}
-	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
-		/*
-		 * Shadow page cache uses assoc_mapping to point its original
-		 * page cache.  The following code tries the original cache
-		 * if the given cache is a shadow and it didn't hit.
-		 */
-		opage = find_lock_page(mapping->assoc_mapping, index);
-		if (!opage)
-			return bh;
-
-		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
-					     b_state);
-		if (buffer_uptodate(obh)) {
-			nilfs_copy_buffer(bh, obh);
-			if (buffer_dirty(obh)) {
-				nilfs_mark_buffer_dirty(bh);
-				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
-					nilfs_mdt_mark_dirty(inode);
-			}
-		}
-		brelse(obh);
-		unlock_page(opage);
-		page_cache_release(opage);
-	}
 	return bh;
 }
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 91dc0668ec83..b0c5e08d06c8 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1945,11 +1945,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 
 	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
 
-	if (nilfs_doing_gc()) {
+	if (nilfs_doing_gc())
 		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
-		if (update_sr)
-			nilfs_commit_gcdat_inode(nilfs);
-	} else
+	else
 		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
 
 	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -2472,13 +2470,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 
 	nilfs_transaction_lock(sbi, &ti, 1);
 
-	err = nilfs_init_gcdat_inode(nilfs);
+	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
 	if (unlikely(err))
 		goto out_unlock;
 
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
-	if (unlikely(err))
+	if (unlikely(err)) {
+		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
 		goto out_unlock;
+	}
 
 	sci->sc_freesegs = kbufs[4];
 	sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2510,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
  out_unlock:
 	sci->sc_freesegs = NULL;
 	sci->sc_nfreesegs = 0;
-	nilfs_clear_gcdat_inode(nilfs);
+	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
 	nilfs_transaction_unlock(sbi);
 	return err;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6eeb4f072f83..b7666bc04256 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -96,7 +96,6 @@ void destroy_nilfs(struct the_nilfs *nilfs)
 		nilfs_mdt_destroy(nilfs->ns_sufile);
 		nilfs_mdt_destroy(nilfs->ns_cpfile);
 		nilfs_mdt_destroy(nilfs->ns_dat);
-		nilfs_mdt_destroy(nilfs->ns_gc_dat);
 	}
 	if (nilfs_init(nilfs)) {
 		brelse(nilfs->ns_sbh[0]);
@@ -131,20 +130,14 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
 	if (unlikely(!nilfs->ns_dat))
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
-	if (unlikely(!nilfs->ns_gc_dat))
-		goto failed_dat;
-
 	nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
 	if (unlikely(!nilfs->ns_cpfile))
-		goto failed_gc_dat;
+		goto failed_dat;
 
 	nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
 	if (unlikely(!nilfs->ns_sufile))
 		goto failed_cpfile;
 
-	nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-
 	err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
 			     NILFS_SR_DAT_OFFSET(inode_size));
 	if (unlikely(err))
@@ -173,9 +166,6 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
  failed_cpfile:
 	nilfs_mdt_destroy(nilfs->ns_cpfile);
 
- failed_gc_dat:
-	nilfs_mdt_destroy(nilfs->ns_gc_dat);
-
  failed_dat:
 	nilfs_mdt_destroy(nilfs->ns_dat);
 	goto failed;
@@ -371,7 +361,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	nilfs_mdt_destroy(nilfs->ns_cpfile);
 	nilfs_mdt_destroy(nilfs->ns_sufile);
 	nilfs_mdt_destroy(nilfs->ns_dat);
-	nilfs_mdt_destroy(nilfs->ns_gc_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1908dc7bbd8f..a5178dc43dfd 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -74,7 +74,6 @@ enum {
  * @ns_dat: DAT file inode
  * @ns_cpfile: checkpoint file inode
  * @ns_sufile: segusage file inode
- * @ns_gc_dat: shadow inode of the DAT file inode for GC
  * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
  * @ns_cptree_lock: lock protecting @ns_cptree
  * @ns_gc_inodes: dummy inodes to keep live blocks
@@ -149,7 +148,6 @@ struct the_nilfs {
 	struct inode	       *ns_dat;
 	struct inode	       *ns_cpfile;
 	struct inode	       *ns_sufile;
-	struct inode	       *ns_gc_dat;
 
 	/* Checkpoint tree */
 	struct rb_root		ns_cptree;
-- 
cgit v1.2.3


From f1e89c86fdd0f5e59f6768146c86437934202033 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 5 Sep 2010 12:20:59 +0900
Subject: nilfs2: use iget for all metadata files

This makes use of iget5_locked to allocate or get inode for metadata
files to stop using own inode allocator.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.c    | 49 +++++++++++++++++++++-------------
 fs/nilfs2/cpfile.h    |  4 +--
 fs/nilfs2/dat.c       | 61 ++++++++++++++++++++++++------------------
 fs/nilfs2/dat.h       |  4 +--
 fs/nilfs2/ifile.c     | 51 ++++++++++++++++++++++++-----------
 fs/nilfs2/ifile.h     |  4 ++-
 fs/nilfs2/inode.c     | 13 ++++++---
 fs/nilfs2/mdt.c       | 16 +++++------
 fs/nilfs2/mdt.h       |  3 +--
 fs/nilfs2/nilfs.h     |  2 ++
 fs/nilfs2/sufile.c    | 73 +++++++++++++++++++++++++++++----------------------
 fs/nilfs2/sufile.h    |  4 +--
 fs/nilfs2/super.c     | 21 ++++++++++-----
 fs/nilfs2/the_nilfs.c | 58 ++++++++++++++--------------------------
 14 files changed, 207 insertions(+), 156 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..03de1da8795b 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -933,27 +933,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 }
 
 /**
- * nilfs_cpfile_read - read cpfile inode
- * @cpfile: cpfile inode
- * @raw_inode: on-disk cpfile inode
- */
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
-{
-	return nilfs_read_inode_common(cpfile, raw_inode);
-}
-
-/**
- * nilfs_cpfile_new - create cpfile
- * @nilfs: nilfs object
+ * nilfs_cpfile_read - read or get cpfile inode
+ * @sb: super block instance
  * @cpsize: size of a checkpoint entry
+ * @raw_inode: on-disk cpfile inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
 	struct inode *cpfile;
+	int err;
 
-	cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
-	if (cpfile)
-		nilfs_mdt_set_entry_size(cpfile, cpsize,
-					 sizeof(struct nilfs_cpfile_header));
-	return cpfile;
+	cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
+	if (unlikely(!cpfile))
+		return -ENOMEM;
+	if (!(cpfile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
+	if (err)
+		goto failed;
+
+	nilfs_mdt_set_entry_size(cpfile, cpsize,
+				 sizeof(struct nilfs_cpfile_header));
+
+	err = nilfs_read_inode_common(cpfile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(cpfile);
+ out:
+	*inodep = cpfile;
+	return 0;
+ failed:
+	iget_failed(cpfile);
+	return err;
 }
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
 				size_t);
 
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7091c4e0f042..ab04a68f425d 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -463,39 +463,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 }
 
 /**
- * nilfs_dat_read - read dat inode
- * @dat: dat inode
- * @raw_inode: on-disk dat inode
- */
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
-{
-	return nilfs_read_inode_common(dat, raw_inode);
-}
-
-/**
- * nilfs_dat_new - create dat file
- * @nilfs: nilfs object
+ * nilfs_dat_read - read or get dat inode
+ * @sb: super block instance
  * @entry_size: size of a dat entry
+ * @raw_inode: on-disk dat inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep)
 {
 	static struct lock_class_key dat_lock_key;
 	struct inode *dat;
 	struct nilfs_dat_info *di;
 	int err;
 
-	dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
-	if (dat) {
-		err = nilfs_palloc_init_blockgroup(dat, entry_size);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(dat);
-			return NULL;
-		}
+	dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
+	if (unlikely(!dat))
+		return -ENOMEM;
+	if (!(dat->i_state & I_NEW))
+		goto out;
 
-		di = NILFS_DAT_I(dat);
-		lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
-		nilfs_palloc_setup_cache(dat, &di->palloc_cache);
-		nilfs_mdt_setup_shadow_map(dat, &di->shadow);
-	}
-	return dat;
+	err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(dat, entry_size);
+	if (err)
+		goto failed;
+
+	di = NILFS_DAT_I(dat);
+	lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+	nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+	nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+
+	err = nilfs_read_inode_common(dat, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(dat);
+ out:
+	*inodep = dat;
+	return 0;
+ failed:
+	iget_failed(dat);
+	return err;
 }
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep);
 
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..9f8a2da67f90 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
- * nilfs_ifile_new - create inode file
- * @sbi: nilfs_sb_info struct
+ * nilfs_ifile_read - read or get ifile inode
+ * @sb: super block instance
+ * @root: root object
  * @inode_size: size of an inode
+ * @raw_inode: on-disk ifile inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep)
 {
 	struct inode *ifile;
 	int err;
 
-	ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
-			      sizeof(struct nilfs_ifile_info));
-	if (ifile) {
-		err = nilfs_palloc_init_blockgroup(ifile, inode_size);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(ifile);
-			return NULL;
-		}
-		nilfs_palloc_setup_cache(ifile,
-					 &NILFS_IFILE_I(ifile)->palloc_cache);
-	}
-	return ifile;
+	ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
+	if (unlikely(!ifile))
+		return -ENOMEM;
+	if (!(ifile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
+			     sizeof(struct nilfs_ifile_info));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+	if (err)
+		goto failed;
+
+	nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
+
+	err = nilfs_read_inode_common(ifile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(ifile);
+ out:
+	*inodep = ifile;
+	return 0;
+ failed:
+	iget_failed(ifile);
+	return err;
 }
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep);
 
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5485dd12da64..5b3d43fb4e12 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -506,16 +506,23 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
 	return 0;
 }
 
-struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
-			 unsigned long ino)
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino)
 {
 	struct nilfs_iget_args args = {
 		.ino = ino, .root = root, .cno = 0, .for_gc = 0
 	};
+
+	return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+}
+
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino)
+{
 	struct inode *inode;
 	int err;
 
-	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+	inode = nilfs_iget_locked(sb, root, ino);
 	if (unlikely(!inode))
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 3bbd340a5136..44326cfe1fa9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -444,8 +444,7 @@ static const struct inode_operations def_mdt_iops;
 static const struct file_operations def_mdt_fops;
 
 
-int nilfs_mdt_init(struct inode *inode, struct the_nilfs *nilfs,
-		   gfp_t gfp_mask, size_t objsz)
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 {
 	struct nilfs_mdt_info *mi;
 
@@ -453,13 +452,17 @@ int nilfs_mdt_init(struct inode *inode, struct the_nilfs *nilfs,
 	if (!mi)
 		return -ENOMEM;
 
-	mi->mi_nilfs = nilfs;
+	mi->mi_nilfs = NILFS_I_NILFS(inode);
 	init_rwsem(&mi->mi_sem);
 	inode->i_private = mi;
 
 	inode->i_mode = S_IFREG;
 	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
-	inode->i_mapping->backing_dev_info = nilfs->ns_bdi;
+	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
+
+	inode->i_op = &def_mdt_iops;
+	inode->i_fop = &def_mdt_fops;
+	inode->i_mapping->a_ops = &def_mdt_aops;
 
 	return 0;
 }
@@ -544,13 +547,10 @@ struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
 	if (!inode)
 		return NULL;
 
-	if (nilfs_mdt_init(inode, nilfs, NILFS_MDT_GFP, objsz) < 0) {
+	if (nilfs_mdt_init(inode, NILFS_MDT_GFP, objsz) < 0) {
 		nilfs_destroy_inode(inode);
 		return NULL;
 	}
-	inode->i_op = &def_mdt_iops;
-	inode->i_fop = &def_mdt_fops;
-	inode->i_mapping->a_ops = &def_mdt_aops;
 	return inode;
 }
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 1e0901c3fd6b..73ff7c055715 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -85,8 +85,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-int nilfs_mdt_init(struct inode *inode, struct the_nilfs *nilfs,
-		   gfp_t gfp_mask, size_t objsz);
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
 struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
 			    size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index e9f457951e32..2ca2ca5ca848 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -244,6 +244,8 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino);
 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
 			 unsigned long ino);
 extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..599d9c27761e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 }
 
 /**
- * nilfs_sufile_read - read sufile inode
- * @sufile: sufile inode
+ * nilfs_sufile_read - read or get sufile inode
+ * @sb: super block instance
+ * @susize: size of a segment usage entry
  * @raw_inode: on-disk sufile inode
+ * @inodep: buffer to store the inode
  */
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
-	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	struct inode *sufile;
+	struct nilfs_sufile_info *sui;
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
 	void *kaddr;
-	int ret;
+	int err;
 
-	ret = nilfs_read_inode_common(sufile, raw_inode);
-	if (ret < 0)
-		return ret;
+	sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
+	if (unlikely(!sufile))
+		return -ENOMEM;
+	if (!(sufile->i_state & I_NEW))
+		goto out;
 
-	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-	if (!ret) {
-		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-		header = kaddr + bh_offset(header_bh);
-		sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-		kunmap_atomic(kaddr, KM_USER0);
-		brelse(header_bh);
-	}
-	return ret;
-}
+	err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
+	if (err)
+		goto failed;
 
-/**
- * nilfs_sufile_new - create sufile
- * @nilfs: nilfs object
- * @susize: size of a segment usage entry
- */
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
-{
-	struct inode *sufile;
+	nilfs_mdt_set_entry_size(sufile, susize,
+				 sizeof(struct nilfs_sufile_header));
+
+	err = nilfs_read_inode_common(sufile, raw_inode);
+	if (err)
+		goto failed;
+
+	err = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (err)
+		goto failed;
 
-	sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
-			       sizeof(struct nilfs_sufile_info));
-	if (sufile)
-		nilfs_mdt_set_entry_size(sufile, susize,
-					 sizeof(struct nilfs_sufile_header));
-	return sufile;
+	sui = NILFS_SUI(sufile);
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(header_bh);
+
+	unlock_new_inode(sufile);
+ out:
+	*inodep = sufile;
+	return 0;
+ failed:
+	iget_failed(sufile);
+	return err;
 }
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..203f6102a62f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
 /**
  * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5893cb27c909..39e7d7f8eda0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -356,6 +356,10 @@ static void nilfs_put_super(struct super_block *sb)
 		up_write(&nilfs->ns_sem);
 	}
 
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
 	destroy_nilfs(nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
@@ -403,10 +407,6 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 	if (root->ifile)
 		goto reuse; /* already attached checkpoint */
 
-	root->ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
-	if (!root->ifile)
-		goto failed;
-
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
 					  &bh_cp);
@@ -421,8 +421,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 		}
 		goto failed;
 	}
-	err = nilfs_read_inode_common(root->ifile, &raw_cp->cp_ifile_inode);
-	if (unlikely(err))
+
+	err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
+			       &raw_cp->cp_ifile_inode, &root->ifile);
+	if (err)
 		goto failed_bh;
 
 	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
@@ -895,7 +897,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err) {
 		printk(KERN_ERR "NILFS: error loading last checkpoint "
 		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
-		goto failed_nilfs;
+		goto failed_unload;
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -924,6 +926,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
  failed_checkpoint:
 	nilfs_put_root(fsroot);
 
+ failed_unload:
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
  failed_nilfs:
 	destroy_nilfs(nilfs);
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index b7666bc04256..4d6763e28eb5 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -92,11 +92,6 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 void destroy_nilfs(struct the_nilfs *nilfs)
 {
 	might_sleep();
-	if (nilfs_loaded(nilfs)) {
-		nilfs_mdt_destroy(nilfs->ns_sufile);
-		nilfs_mdt_destroy(nilfs->ns_cpfile);
-		nilfs_mdt_destroy(nilfs->ns_dat);
-	}
 	if (nilfs_init(nilfs)) {
 		brelse(nilfs->ns_sbh[0]);
 		brelse(nilfs->ns_sbh[1]);
@@ -104,11 +99,13 @@ void destroy_nilfs(struct the_nilfs *nilfs)
 	kfree(nilfs);
 }
 
-static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
+static int nilfs_load_super_root(struct the_nilfs *nilfs,
+				 struct super_block *sb, sector_t sr_block)
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	struct nilfs_inode *rawi;
 	unsigned dat_entry_size, segment_usage_size, checkpoint_size;
 	unsigned inode_size;
 	int err;
@@ -125,34 +122,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
 
 	inode_size = nilfs->ns_inode_size;
 
-	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
-	if (unlikely(!nilfs->ns_dat))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
+	err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
+	if (err)
 		goto failed;
 
-	nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
-	if (unlikely(!nilfs->ns_cpfile))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
+	err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
+	if (err)
 		goto failed_dat;
 
-	nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
-	if (unlikely(!nilfs->ns_sufile))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
+	err = nilfs_sufile_read(sb, segment_usage_size, rawi,
+				&nilfs->ns_sufile);
+	if (err)
 		goto failed_cpfile;
 
-	err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
-			     NILFS_SR_DAT_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
-				NILFS_SR_CPFILE_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
-				NILFS_SR_SUFILE_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
 	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
 	nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
 
@@ -160,14 +145,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
 	brelse(bh_sr);
 	return err;
 
- failed_sufile:
-	nilfs_mdt_destroy(nilfs->ns_sufile);
-
  failed_cpfile:
-	nilfs_mdt_destroy(nilfs->ns_cpfile);
+	iput(nilfs->ns_cpfile);
 
  failed_dat:
-	nilfs_mdt_destroy(nilfs->ns_dat);
+	iput(nilfs->ns_dat);
 	goto failed;
 }
 
@@ -290,7 +272,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 			goto scan_error;
 	}
 
-	err = nilfs_load_super_root(nilfs, ri.ri_super_root);
+	err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error loading super root.\n");
 		goto failed;
@@ -358,9 +340,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	goto failed;
 
  failed_unload:
-	nilfs_mdt_destroy(nilfs->ns_cpfile);
-	nilfs_mdt_destroy(nilfs->ns_sufile);
-	nilfs_mdt_destroy(nilfs->ns_dat);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -782,7 +764,7 @@ void nilfs_put_root(struct nilfs_root *root)
 		rb_erase(&root->rb_node, &nilfs->ns_cptree);
 		spin_unlock(&nilfs->ns_cptree_lock);
 		if (root->ifile)
-			nilfs_mdt_destroy(root->ifile);
+			iput(root->ifile);
 
 		kfree(root);
 	}
-- 
cgit v1.2.3


From 032dbb3b503a30fce732ec4c05525d0abed1f1d6 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 13 Sep 2010 11:16:34 +0900
Subject: nilfs2: see state of root dentry for mount check of snapshots

After applied the patch that unified sb instances, root dentry of
snapshots can be left in dcache even after their trees are unmounted.

The orphan root dentry/inode keeps a root object, and this causes
false positive of nilfs_checkpoint_is_mounted function.

This resolves the issue by having nilfs_checkpoint_is_mounted test
whether the root dentry is busy or not.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.c    | 23 ++++++++---------------
 fs/nilfs2/inode.c     | 10 ++++++++++
 fs/nilfs2/nilfs.h     |  3 +++
 fs/nilfs2/super.c     | 32 ++++++++++++++++++++++++++++++++
 fs/nilfs2/the_nilfs.c | 21 ---------------------
 fs/nilfs2/the_nilfs.h |  1 -
 6 files changed, 53 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 03de1da8795b..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
  */
 int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 {
-	struct the_nilfs *nilfs;
 	int ret;
 
-	nilfs = NILFS_MDT(cpfile)->mi_nilfs;
-
 	switch (mode) {
 	case NILFS_CHECKPOINT:
-		/*
-		 * Check for protecting existing snapshot mounts:
-		 * ns_mount_mutex is used to make this operation atomic and
-		 * exclusive with a new mount job.  Though it doesn't cover
-		 * umount, it's enough for the purpose.
-		 */
-		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
-			/* Current implementation does not have to protect
-			   plain read-only mounts since they are exclusive
-			   with a read/write mount and are protected from the
-			   cleaner. */
+		if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
+			/*
+			 * Current implementation does not have to protect
+			 * plain read-only mounts since they are exclusive
+			 * with a read/write mount and are protected from the
+			 * cleaner.
+			 */
 			ret = -EBUSY;
-		} else
+		else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
 		return ret;
 	case NILFS_SNAPSHOT:
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5b3d43fb4e12..71d4bc8464e0 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -506,6 +506,16 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
 	return 0;
 }
 
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return ilookup5(sb, ino, nilfs_iget_test, &args);
+}
+
 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
 				unsigned long ino)
 {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 2ca2ca5ca848..f6e276eaaf6f 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -244,6 +244,8 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino);
 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
 				unsigned long ino);
 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
@@ -285,6 +287,7 @@ extern int nilfs_commit_super(struct nilfs_sb_info *, int);
 extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 			    struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 39e7d7f8eda0..ab96d26bf7e9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -833,6 +833,38 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
 	return nilfs_tree_was_touched(root_dentry);
 }
 
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
+{
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct nilfs_root *root;
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret;
+
+	if (cno < 0 || cno > nilfs->ns_cno)
+		return false;
+
+	if (cno >= nilfs_last_cno(nilfs))
+		return true;	/* protect recent checkpoints */
+
+	ret = false;
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (root) {
+		inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
+		if (inode) {
+			dentry = d_find_alias(inode);
+			if (dentry) {
+				if (nilfs_tree_was_touched(dentry))
+					ret = nilfs_try_to_shrink_tree(dentry);
+				dput(dentry);
+			}
+			iput(inode);
+		}
+		nilfs_put_root(root);
+	}
+	return ret;
+}
+
 /**
  * nilfs_fill_super() - initialize a super block instance
  * @sb: super_block
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 4d6763e28eb5..4cc705a1d135 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -769,24 +769,3 @@ void nilfs_put_root(struct nilfs_root *root)
 		kfree(root);
 	}
 }
-
-int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
-				int snapshot_mount)
-{
-	struct nilfs_root *root;
-	int ret;
-
-	if (cno < 0 || cno > nilfs->ns_cno)
-		return false;
-
-	if (cno >= nilfs_last_cno(nilfs))
-		return true;	/* protect recent checkpoints */
-
-	ret = false;
-	root = nilfs_lookup_root(nilfs, cno);
-	if (root) {
-		ret = true;
-		nilfs_put_root(root);
-	}
-	return ret;
-}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index a5178dc43dfd..cae56f338b64 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -242,7 +242,6 @@ struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
 					     __u64 cno);
 void nilfs_put_root(struct nilfs_root *root);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
-int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
 void nilfs_swap_super_block(struct the_nilfs *);
-- 
cgit v1.2.3


From c6e071884aca360a14c21757d760e76ec34b4894 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 5 Sep 2010 00:23:50 +0900
Subject: nilfs2: get rid of mi_nilfs back pointer to nilfs object

This removes a back pointer to nilfs object from nilfs_mdt_info
structure that is attached to metadata files.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c    | 9 +++------
 fs/nilfs2/mdt.h    | 8 ++------
 fs/nilfs2/sufile.c | 4 ++--
 fs/nilfs2/sufile.h | 2 +-
 4 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 44326cfe1fa9..32695f3c4b9a 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -78,7 +78,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 						     struct buffer_head *,
 						     void *))
 {
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_transaction_info ti;
 	struct buffer_head *bh;
@@ -167,9 +167,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		unlock_buffer(bh);
 		goto failed_bh;
 	}
-	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-	bh->b_blocknr = (sector_t)blknum;
-	set_buffer_mapped(bh);
+	map_bh(bh, inode->i_sb, (sector_t)blknum);
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
@@ -412,7 +410,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		return 0;
 
 	sb = inode->i_sb;
-	nilfs = NILFS_MDT(inode)->mi_nilfs;
+	nilfs = NILFS_SB(sb)->s_nilfs;
 
 	if (!sb) {
 		down_read(&nilfs->ns_writer_sem);
@@ -452,7 +450,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 	if (!mi)
 		return -ENOMEM;
 
-	mi->mi_nilfs = NILFS_I_NILFS(inode);
 	init_rwsem(&mi->mi_sem);
 	inode->i_private = mi;
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 73ff7c055715..76356d2899ed 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -37,7 +37,6 @@ struct nilfs_shadow_map {
 
 /**
  * struct nilfs_mdt_info - on-memory private data of meta data files
- * @mi_nilfs: back pointer to the_nilfs struct
  * @mi_sem: reader/writer semaphore for meta data operations
  * @mi_bgl: per-blockgroup locking
  * @mi_entry_size: size of an entry
@@ -49,7 +48,6 @@ struct nilfs_shadow_map {
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
 struct nilfs_mdt_info {
-	struct the_nilfs       *mi_nilfs;
 	struct rw_semaphore	mi_sem;
 	struct blockgroup_lock *mi_bgl;
 	unsigned		mi_entry_size;
@@ -68,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 
 static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-
-	return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
+	return NILFS_SB(inode->i_sb)->s_nilfs;
 }
 
 /* Default GFP flags using highmem */
@@ -117,7 +113,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
 
 static inline __u64 nilfs_mdt_cno(struct inode *inode)
 {
-	return NILFS_MDT(inode)->mi_nilfs->ns_cno;
+	return NILFS_I_NILFS(inode)->ns_cno;
 }
 
 #define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 599d9c27761e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 {
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	int ret;
 
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 	struct nilfs_segment_usage *su;
 	struct nilfs_suinfo *si = buf;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	unsigned long nsegs, segusages_per_block;
 	ssize_t n;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 203f6102a62f..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
-	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
+	return NILFS_I_NILFS(sufile)->ns_nsegments;
 }
 
 unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
-- 
cgit v1.2.3


From 090fd5b10165033d7c30afde0a7e59141d820602 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 5 Sep 2010 16:17:35 +0900
Subject: nilfs2: get rid of back pointer to writable sb instance

Nilfs object holds a back pointer to a writable super block instance
in nilfs->ns_writer, and this became eliminable since sb is now made
per device and all inodes have a valid pointer to it.

This deletes the ns_writer pointer and a reader/writer semaphore
protecting it.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c       | 33 ++-------------------------------
 fs/nilfs2/recovery.c  |  4 ----
 fs/nilfs2/segment.c   |  4 ----
 fs/nilfs2/the_nilfs.c |  1 -
 fs/nilfs2/the_nilfs.h | 21 ---------------------
 5 files changed, 2 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 32695f3c4b9a..0a2ccfc0d6f9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -78,25 +78,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 						     struct buffer_head *,
 						     void *))
 {
-	struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_transaction_info ti;
 	struct buffer_head *bh;
 	int err;
 
-	if (!sb) {
-		/*
-		 * Make sure this function is not called from any
-		 * read-only context.
-		 */
-		if (!nilfs->ns_writer) {
-			WARN_ON(1);
-			err = -EROFS;
-			goto out;
-		}
-		sb = nilfs->ns_writer->s_super;
-	}
-
 	nilfs_transaction_begin(sb, &ti, 0);
 
 	err = -ENOMEM;
@@ -112,7 +98,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 	if (buffer_uptodate(bh))
 		goto failed_bh;
 
-	bh->b_bdev = nilfs->ns_bdev;
+	bh->b_bdev = sb->s_bdev;
 	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
 	if (likely(!err)) {
 		get_bh(bh);
@@ -129,7 +115,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 		err = nilfs_transaction_commit(sb);
 	else
 		nilfs_transaction_abort(sb);
- out:
+
 	return err;
 }
 
@@ -398,8 +384,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct super_block *sb;
-	struct the_nilfs *nilfs;
-	struct nilfs_sb_info *writer = NULL;
 	int err = 0;
 
 	redirty_page_for_writepage(wbc, page);
@@ -410,25 +394,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		return 0;
 
 	sb = inode->i_sb;
-	nilfs = NILFS_SB(sb)->s_nilfs;
-
-	if (!sb) {
-		down_read(&nilfs->ns_writer_sem);
-		writer = nilfs->ns_writer;
-		if (!writer) {
-			up_read(&nilfs->ns_writer_sem);
-			return -EROFS;
-		}
-		sb = writer->s_super;
-	}
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		err = nilfs_construct_segment(sb);
 	else if (wbc->for_reclaim)
 		nilfs_flush_segment(sb, inode->i_ino);
 
-	if (writer)
-		up_read(&nilfs->ns_writer_sem);
 	return err;
 }
 
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index dcb5a9812c6c..5d2711c28da7 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	segnum[2] = ri->ri_segnum;
 	segnum[3] = ri->ri_nextnum;
 
-	nilfs_attach_writer(nilfs, sbi);
 	/*
 	 * Releasing the next segment of the latest super root.
 	 * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 
  failed:
 	/* No need to recover sufile because it will be destroyed on error */
-	nilfs_detach_writer(nilfs, sbi);
 	return err;
 }
 
@@ -599,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	};
 	int state = RF_INIT_ST;
 
-	nilfs_attach_writer(nilfs, sbi);
 	pseg_start = ri->ri_lsegs_start;
 	seg_seq = ri->ri_lsegs_start_seq;
 	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -690,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
  out:
 	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(nilfs, sbi);
 	return err;
 
  confused:
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b0c5e08d06c8..56350bf1f79f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2799,7 +2799,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
 	if (NILFS_SC(sbi)) {
@@ -2815,10 +2814,8 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
 
-	nilfs_attach_writer(nilfs, sbi);
 	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
-		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
 		sbi->s_sc_info = NULL;
 	}
@@ -2855,5 +2852,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	up_write(&nilfs->ns_segctor_sem);
 
 	nilfs_dispose_list(sbi, &garbage_list, 1);
-	nilfs_detach_writer(nilfs, sbi);
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 4cc705a1d135..a94aa57c4bd9 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -75,7 +75,6 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	nilfs->ns_bdev = bdev;
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
-	init_rwsem(&nilfs->ns_writer_sem);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_cptree = RB_ROOT;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index cae56f338b64..bbbc1c748aac 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -48,9 +48,7 @@ enum {
  * @ns_flags: flags
  * @ns_bdev: block device
  * @ns_bdi: backing dev info
- * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
- * @ns_writer_sem: semaphore protecting ns_writer attach/detach
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
  * @ns_sbwtime: previous write time of super block
@@ -93,9 +91,7 @@ struct the_nilfs {
 
 	struct block_device    *ns_bdev;
 	struct backing_dev_info *ns_bdi;
-	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
-	struct rw_semaphore	ns_writer_sem;
 
 	/*
 	 * used for
@@ -252,23 +248,6 @@ static inline void nilfs_get_root(struct nilfs_root *root)
 	atomic_inc(&root->count);
 }
 
-static inline void
-nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-	down_write(&nilfs->ns_writer_sem);
-	nilfs->ns_writer = sbi;
-	up_write(&nilfs->ns_writer_sem);
-}
-
-static inline void
-nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-	down_write(&nilfs->ns_writer_sem);
-	if (sbi == nilfs->ns_writer)
-		nilfs->ns_writer = NULL;
-	up_write(&nilfs->ns_writer_sem);
-}
-
 static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
 {
 	unsigned valid_fs;
-- 
cgit v1.2.3


From 2879ed66e4c6da1dfc6bb0bd04566b61824f9256 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 5 Sep 2010 13:35:53 +0900
Subject: nilfs2: remove own inode allocator and destructor for metadata files

This finally removes own inode allocator and destructor functions for
metadata files.  Several routines, nilfs_mdt_new(),
nilfs_mdt_new_common(), nilfs_mdt_clear(), nilfs_mdt_destroy(), and
nilfs_alloc_inode_common() will be gone.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c   | 111 ------------------------------------------------------
 fs/nilfs2/mdt.h   |   5 ---
 fs/nilfs2/nilfs.h |   1 -
 fs/nilfs2/super.c |   9 +----
 4 files changed, 2 insertions(+), 124 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 0a2ccfc0d6f9..d60fdb097d52 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
 
 #define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
 
-#define INIT_UNUSED_INODE_FIELDS
 
 static int
 nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -435,93 +434,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 	return 0;
 }
 
-/*
- * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
- * ifile, or gcinodes.  This allows the B-tree code and segment constructor
- * to treat them like regular files, and this helps to simplify the
- * implementation.
- *   On the other hand, some of the pseudo inodes have an irregular point:
- * They don't have valid inode->i_sb pointer because their lifetimes are
- * longer than those of the super block structs; they may continue for
- * several consecutive mounts/umounts.  This would need discussions.
- */
-/**
- * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
- * @nilfs: nilfs object
- * @sb: super block instance the metadata file belongs to
- * @ino: inode number
- */
-struct inode *
-nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino)
-{
-	struct inode *inode = nilfs_alloc_inode_common(nilfs);
-
-	if (!inode)
-		return NULL;
-	else {
-		struct address_space * const mapping = &inode->i_data;
-
-		inode->i_sb = sb; /* sb may be NULL for some meta data files */
-		inode->i_blkbits = nilfs->ns_blocksize_bits;
-		inode->i_flags = 0;
-		atomic_set(&inode->i_count, 1);
-		inode->i_nlink = 1;
-		inode->i_ino = ino;
-
-#ifdef INIT_UNUSED_INODE_FIELDS
-		atomic_set(&inode->i_writecount, 0);
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		inode->i_bytes = 0;
-		inode->i_generation = 0;
-#ifdef CONFIG_QUOTA
-		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
-#endif
-		inode->i_pipe = NULL;
-		inode->i_bdev = NULL;
-		inode->i_cdev = NULL;
-		inode->i_rdev = 0;
-#ifdef CONFIG_SECURITY
-		inode->i_security = NULL;
-#endif
-		inode->dirtied_when = 0;
-
-		INIT_LIST_HEAD(&inode->i_list);
-		INIT_LIST_HEAD(&inode->i_sb_list);
-		inode->i_state = 0;
-#endif
-
-		spin_lock_init(&inode->i_lock);
-		mutex_init(&inode->i_mutex);
-		init_rwsem(&inode->i_alloc_sem);
-
-		mapping->host = NULL;  /* instead of inode */
-		mapping->flags = 0;
-		mapping->assoc_mapping = NULL;
-
-		inode->i_mapping = mapping;
-	}
-
-	return inode;
-}
-
-struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino, size_t objsz)
-{
-	struct inode *inode;
-
-	inode = nilfs_mdt_new_common(nilfs, sb, ino);
-	if (!inode)
-		return NULL;
-
-	if (nilfs_mdt_init(inode, NILFS_MDT_GFP, objsz) < 0) {
-		nilfs_destroy_inode(inode);
-		return NULL;
-	}
-	return inode;
-}
-
 void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 			      unsigned header_size)
 {
@@ -688,26 +600,3 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode)
 	truncate_inode_pages(&shadow->frozen_btnodes, 0);
 	up_write(&mi->mi_sem);
 }
-
-static void nilfs_mdt_clear(struct inode *inode)
-{
-	struct nilfs_inode_info *ii = NILFS_I(inode);
-
-	invalidate_mapping_pages(inode->i_mapping, 0, -1);
-	truncate_inode_pages(inode->i_mapping, 0);
-
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
-}
-
-void nilfs_mdt_destroy(struct inode *inode)
-{
-	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
-
-	if (mdi->mi_palloc_cache)
-		nilfs_palloc_destroy_cache(inode);
-	nilfs_mdt_clear(inode);
-
-	nilfs_destroy_inode(inode);
-}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 76356d2899ed..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -82,11 +82,6 @@ int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
 int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
-			    size_t);
-struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t);
-void nilfs_mdt_destroy(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 
 int nilfs_mdt_setup_shadow_map(struct inode *inode,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f6e276eaaf6f..f7560da5a567 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -266,7 +266,6 @@ extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
 
 /* super.c */
-extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ab96d26bf7e9..d92ebd5d60d9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -146,7 +146,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 }
 
 
-struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
+struct inode *nilfs_alloc_inode(struct super_block *sb)
 {
 	struct nilfs_inode_info *ii;
 
@@ -157,15 +157,10 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 	ii->i_state = 0;
 	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
+	nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
 	return &ii->vfs_inode;
 }
 
-struct inode *nilfs_alloc_inode(struct super_block *sb)
-{
-	return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
-}
-
 void nilfs_destroy_inode(struct inode *inode)
 {
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
-- 
cgit v1.2.3


From c05dbfc2609993ccc067879579e2a7726e12b3f1 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 16 Sep 2010 00:36:24 +0900
Subject: nilfs2: accept 64-bit checkpoint numbers in cp mount option

The current implementation doesn't mount snapshots with checkpoint
numbers larger than INT_MAX since it uses match_int() for parsing
"cp=" mount option.

This uses simple_strtoull() for the conversion to resolve the issue.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index d92ebd5d60d9..a1cd444103ff 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -556,7 +556,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
-	int option;
 
 	if (!options)
 		return 1;
@@ -594,8 +593,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
 			break;
 		case Opt_snapshot:
-			if (match_int(&args[0], &option) || option <= 0)
-				return 0;
 			if (is_remount) {
 				printk(KERN_ERR
 				       "NILFS: \"%s\" option is invalid "
@@ -1065,7 +1062,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 {
 	char *p, *options = data;
 	substring_t args[MAX_OPT_ARGS];
-	int option, token;
+	int token;
 	int ret = 0;
 
 	do {
@@ -1073,16 +1070,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 		if (p != NULL && *p) {
 			token = match_token(p, tokens, args);
 			if (token == Opt_snapshot) {
-				if (!(sd->flags & MS_RDONLY))
+				if (!(sd->flags & MS_RDONLY)) {
 					ret++;
-				else {
-					ret = match_int(&args[0], &option);
-					if (!ret) {
-						if (option > 0)
-							sd->cno = option;
-						else
-							ret++;
-					}
+				} else {
+					sd->cno = simple_strtoull(args[0].from,
+								  NULL, 0);
+					/*
+					 * No need to see the end pointer;
+					 * match_token() has done syntax
+					 * checking.
+					 */
+					if (sd->cno == 0)
+						ret++;
 				}
 			}
 			if (ret)
-- 
cgit v1.2.3


From 5beb6e0b2008386571fd342d0a4a14f5c8c0baf8 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 20 Sep 2010 18:19:06 +0900
Subject: nilfs2: add bdev freeze/thaw support

Nilfs hasn't supported the freeze/thaw feature because it didn't work
due to the peculiar design that multiple super block instances could
be allocated for a device.  This limitation was removed by the patch
"nilfs2: do not allocate multiple super block instances for a device".

So now this adds the freeze/thaw support to nilfs.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c   |  2 ++
 fs/nilfs2/segment.c |  2 ++
 fs/nilfs2/super.c   | 58 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 58 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 338858fc907c..3e90f86d5bfe 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -580,6 +580,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		goto out_free;
 	}
 
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
 	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
 	if (ret < 0)
 		printk(KERN_ERR "NILFS: GC failed during preparation: "
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 56350bf1f79f..172ad4257494 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
 	if (ret > 0)
 		return 0;
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+
 	sbi = NILFS_SB(sb);
 	nilfs = sbi->s_nilfs;
 	down_read(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index a1cd444103ff..92e8c769584c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -73,6 +73,7 @@ struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
 
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -439,6 +440,36 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 	return err;
 }
 
+static int nilfs_freeze(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	/* Mark super block clean */
+	down_write(&nilfs->ns_sem);
+	err = nilfs_cleanup_super(sbi);
+	up_write(&nilfs->ns_sem);
+	return err;
+}
+
+static int nilfs_unfreeze(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	down_write(&nilfs->ns_sem);
+	nilfs_setup_super(sbi, false);
+	up_write(&nilfs->ns_sem);
+	return 0;
+}
+
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -523,6 +554,8 @@ static const struct super_operations nilfs_sops = {
 	.put_super      = nilfs_put_super,
 	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
+	.freeze_fs	= nilfs_freeze,
+	.unfreeze_fs	= nilfs_unfreeze,
 	/* .write_super_lockfs */
 	/* .unlockfs */
 	.statfs         = nilfs_statfs,
@@ -626,7 +659,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
-static int nilfs_setup_super(struct nilfs_sb_info *sbi)
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_super_block **sbp;
@@ -638,6 +671,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 	if (!sbp)
 		return -EIO;
 
+	if (!is_mount)
+		goto skip_mount_setup;
+
 	max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
 	mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
 
@@ -654,9 +690,11 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
 
 	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+
+skip_mount_setup:
 	sbp[0]->s_state =
 		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
-	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
 	/* synchronize sbp[1] with sbp[0] */
 	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
@@ -938,7 +976,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi);
+		nilfs_setup_super(sbi, true);
 		up_write(&nilfs->ns_sem);
 	}
 
@@ -1034,7 +1072,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore_opts;
 
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi);
+		nilfs_setup_super(sbi, true);
 		up_write(&nilfs->ns_sem);
 	}
  out:
@@ -1132,7 +1170,19 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
+	if (sd.bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+		err = -EBUSY;
+		goto failed;
+	}
 	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
+	mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
 		goto failed;
-- 
cgit v1.2.3


From 026a7d63d55ba8656ed8c8a0733265cc7d47bb8c Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 7 Oct 2010 14:19:48 +0900
Subject: nilfs2: get rid of bdi from nilfs object

Nilfs now can use sb->s_bdi to get backing_dev_info, so we use it
instead of ns_bdi on the nilfs object and remove ns_bdi.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c       | 2 +-
 fs/nilfs2/segbuf.c    | 3 ++-
 fs/nilfs2/super.c     | 5 ++++-
 fs/nilfs2/the_nilfs.c | 4 ----
 fs/nilfs2/the_nilfs.h | 2 --
 5 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d60fdb097d52..39a5b84e2c9f 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -457,7 +457,7 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 			       struct nilfs_shadow_map *shadow)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
-	struct backing_dev_info *bdi = NILFS_I_NILFS(inode)->ns_bdi;
+	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
 	nilfs_mapping_init_once(&shadow->frozen_data);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 	struct bio *bio = wi->bio;
 	int err;
 
-	if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+	if (segbuf->sb_nbio > 0 &&
+	    bdi_write_congested(segbuf->sb_super->s_bdi)) {
 		wait_for_completion(&segbuf->sb_bio_event);
 		segbuf->sb_nbio--;
 		if (unlikely(atomic_read(&segbuf->sb_err))) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92e8c769584c..8e77016bafae 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -910,6 +910,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct the_nilfs *nilfs;
 	struct nilfs_sb_info *sbi;
 	struct nilfs_root *fsroot;
+	struct backing_dev_info *bdi;
 	__u64 cno;
 	int err;
 
@@ -948,7 +949,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
-	sb->s_bdi = nilfs->ns_bdi;
+
+	bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	sb->s_bdi = bdi ? : &default_backing_dev_info;
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a94aa57c4bd9..bd02b6127d35 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -535,7 +535,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 {
 	struct super_block *sb = sbi->s_super;
 	struct nilfs_super_block *sbp;
-	struct backing_dev_info *bdi;
 	int blocksize;
 	int err;
 
@@ -598,9 +597,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
 
-	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
-	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
-
 	err = nilfs_store_log_cursor(nilfs, sbp);
 	if (err)
 		goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index bbbc1c748aac..69226e14b745 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -47,7 +47,6 @@ enum {
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
  * @ns_bdev: block device
- * @ns_bdi: backing dev info
  * @ns_sem: semaphore for shared states
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
@@ -90,7 +89,6 @@ struct the_nilfs {
 	unsigned long		ns_flags;
 
 	struct block_device    *ns_bdev;
-	struct backing_dev_info *ns_bdi;
 	struct rw_semaphore	ns_sem;
 
 	/*
-- 
cgit v1.2.3


From abc0b50b6b9a9de8ae210f059598265a5438f2c4 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Fri, 8 Oct 2010 22:37:27 +0900
Subject: nilfs2: eliminate sparse warnings - "symbol not declared"

change nilfs_dat_commit_free and nilfs_inode_cachep static
to fix following warnings

fs/nilfs2/super.c:72:19: warning: symbol 'nilfs_inode_cachep' was not declared. Should it be static?
fs/nilfs2/dat.c:106:6: warning: symbol 'nilfs_dat_commit_free' was not declared. Should it be static?

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dat.c   | 3 ++-
 fs/nilfs2/super.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index ab04a68f425d..49c844dab33a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -103,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_abort_alloc_entry(dat, req);
 }
 
-void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
+static void nilfs_dat_commit_free(struct inode *dat,
+				  struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8e77016bafae..ab629078f4e2 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -68,7 +68,7 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
-struct kmem_cache *nilfs_inode_cachep;
+static struct kmem_cache *nilfs_inode_cachep;
 struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
-- 
cgit v1.2.3


From 6b81e14e645016597c81e71cd27ee5c57c3a3c36 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 14 Oct 2010 13:52:00 +0900
Subject: nilfs2: eliminate sparse warning - "context imbalance"

insert sparse annotations to fix following sparse warning.

fs/nilfs2/segment.c:2681:3: warning: context imbalance in 'nilfs_segctor_kill_thread' - unexpected unlock

nilfs_segctor_kill_thread is only called inside sc_state_lock lock.
sparse doesn't detect the context and warn "unexpected unlock".
__acquires/__releases pretend to lock/unlock the sc_state_lock for sparse.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 172ad4257494..d926af626177 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2674,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
 }
 
 static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+	__acquires(&sci->sc_state_lock)
+	__releases(&sci->sc_state_lock)
 {
 	sci->sc_state |= NILFS_SEGCTOR_QUIT;
 
-- 
cgit v1.2.3


From f8cae0f03f75adb54b1d48ddbc90f84a1f5de186 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 22 Oct 2010 19:30:38 -0700
Subject: ocfs2: drop the BLKDEV_IFL_WAIT flag

Commit dd3932eddf42 ("block: remove BLKDEV_IFL_WAIT") had removed the
flag argument to blkdev_issue_flush(), but the ocfs2 merge brought in a
new one.  It didn't cause a merge conflict, so the merges silently
worked out fine, but the result didn't actually compile.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/file.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9e8cc4346b76..1ca6867935bb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 		 * platter
 		 */
 		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					   NULL, BLKDEV_IFL_WAIT);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 		goto bail;
 	}
 
-- 
cgit v1.2.3


From 6c2754c28f2388a276fe21edde826f2113c8f60e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 23 Oct 2010 08:14:12 -0700
Subject: Revert "tty: Add a new file /proc/tty/consoles"

This reverts commit f4a3e0bceb57466c31757f25e4e0ed108d1299ec.  Jiri
Sladby points out that the tty structure we're using may already be
gone, and Al Viro doesn't hold back in complaining about the random
loading of 'filp->private_data' which doesn't have to be a pointer at
all, nor does checking the magic field for TTY_MAGIC prove anything.

Belated review by Al:

 "a) global variable depending on stdin of the last opener? Affecting
     output of read(2)? Really?

  b) iterator is broken; list should be locked in ->start(), unlocked in
     ->stop() and *NOT* unlocked/relocked in ->next()

  c) ->show() ought to do nothing in case of ->device == NULL, instead
     of skipping those in ->next()/->start()

  d) regardless of the merits of the bright idea about asterisk at that
     line in output *and* regardless of (a), the implementation is not
     only atrociously ugly, it's actually very likely to be a roothole.
     Verifying that Cthulhu knows what number happens to be address of a
     tty_struct by blindly dereferencing memory at that address...
     Ouch.

  Please revert that crap."

And Christoph pipes in and NAK's the approach of walking fd tables etc
too.  So it's pretty unanimous.

Noticed-by: Jri Slaby <jslaby@suse.cz>
Requested-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Werner Fink <werner@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  32 --------
 fs/proc/proc_tty.c                 | 158 -------------------------------------
 2 files changed, 190 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 98223a676940..a6aca8740883 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1075,7 +1075,6 @@ Table 1-11: Files in /proc/tty
  drivers       list of drivers and their usage                
  ldiscs        registered line disciplines                    
  driver/serial usage statistic and status of single tty lines 
- consoles      registered system console lines
 ..............................................................................
 
 To see  which  tty's  are  currently in use, you can simply look into the file
@@ -1094,37 +1093,6 @@ To see  which  tty's  are  currently in use, you can simply look into the file
   /dev/tty             /dev/tty        5       0 system:/dev/tty 
   unknown              /dev/tty        4    1-63 console 
 
-To see which character device lines are currently used for the system console
-/dev/console, you may simply look into the file /proc/tty/consoles:
-
-  > cat /proc/tty/consoles
-  tty0                 -WU (ECp)       4:7
-  ttyS0                -W- (Ep)        4:64
-
-The columns are:
-
-  device               name of the device
-  operations           R = can do read operations
-                       W = can do write operations
-                       U = can do unblank
-  flags                E = it is enabled
-                       C = it is prefered console
-                       B = it is primary boot console
-                       p = it is used for printk buffer
-                       b = it is not a TTY but a Braille device
-                       a = it is safe to use when cpu is offline
-                       * = it is standard input of the reading process
-  major:minor          major and minor number of the device separated by a colon
-
-If the reading process holds /dev/console open at the regular standard input
-stream the active device will be marked by an asterisk:
-
-  > cat /proc/tty/consoles < /dev/console
-  tty0                 -WU (ECp*)      4:7
-  ttyS0                -W- (Ep)        4:64
-  > tty
-  /dev/pts/3
-
 
 1.8 Miscellaneous kernel statistics in /proc/stat
 -------------------------------------------------
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index dc44f94022f1..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -12,10 +12,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/console.h>
 #include <linux/seq_file.h>
-#include <linux/fdtable.h>
 #include <linux/bitops.h>
 
 /*
@@ -139,160 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = {
 	.release	= seq_release,
 };
 
-/*
- * The device ID of file descriptor 0 of the current reading
- * task if a character device...
- */
-static dev_t current_dev;
-
-/*
- * This is the handler for /proc/tty/consoles
- */
-static int show_console_dev(struct seq_file *m, void *v)
-{
-	const struct tty_driver *driver;
-	struct console *con;
-	int index, len;
-	char flags[10];
-	dev_t dev;
-
-	if (v == SEQ_START_TOKEN)
-		return 0;
-	con = (struct console *)v;
-	if (!con)
-		return 0;
-	driver = con->device(con, &index);
-	if (!driver)
-		return 0;
-	dev = MKDEV(driver->major, driver->minor_start) + index;
-
-	index = 0;
-	if (con->flags & CON_ENABLED)
-		flags[index++] = 'E';
-	if (con->flags & CON_CONSDEV)
-		flags[index++] = 'C';
-	if (con->flags & CON_BOOT)
-		flags[index++] = 'B';
-	if (con->flags & CON_PRINTBUFFER)
-		flags[index++] = 'p';
-	if (con->flags & CON_BRL)
-		flags[index++] = 'b';
-	if (con->flags & CON_ANYTIME)
-		flags[index++] = 'a';
-	if (current_dev == dev)
-		flags[index++] = '*';
-	flags[index] = 0;
-
-	seq_printf(m, "%s%d%n", con->name, con->index, &len);
-	len = 21 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c", len, ' ');
-	seq_printf(m, "%c%c%c (%s)%n", con->read ? 'R' : '-',
-			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
-			flags, &len);
-	len = 13 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c%4d:%d\n", len, ' ', MAJOR(dev), MINOR(dev));
-
-	return 0;
-}
-
-/* iterator for consoles */
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
-	struct console *con;
-	loff_t off = 0;
-
-	if (*pos == 0)
-		return SEQ_START_TOKEN;
-
-	acquire_console_sem();
-	for (con = console_drivers; con; con = con->next) {
-		if (!con->device)
-			continue;
-		if (++off == *pos)
-			break;
-	}
-	release_console_sem();
-
-	return con;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct console *con;
-
-	acquire_console_sem();
-	if (v == SEQ_START_TOKEN)
-		con = console_drivers;
-	else
-		con = ((struct console *)v)->next;
-	for (; con; con = con->next) {
-		if (!con->device)
-			continue;
-		++*pos;
-		break;
-	}
-	release_console_sem();
-
-	return con;
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-static const struct seq_operations tty_consoles_op = {
-	.start	= c_start,
-	.next	= c_next,
-	.stop	= c_stop,
-	.show	= show_console_dev
-};
-
-/*
- * Used for open /proc/tty/consoles. Before this detect
- * the device ID of file descriptor 0 of the current
- * reading task if a character device...
- */
-static int tty_consoles_open(struct inode *inode, struct file *file)
-{
-	struct files_struct *curfiles;
-
-	current_dev = 0;
-	curfiles = get_files_struct(current);
-	if (curfiles) {
-		const struct file *curfp;
-		spin_lock(&curfiles->file_lock);
-		curfp = fcheck_files(curfiles, 0);
-		if (curfp && curfp->private_data) {
-			const struct inode *inode;
-			dget(curfp->f_dentry);
-			inode = curfp->f_dentry->d_inode;
-			if (S_ISCHR(inode->i_mode)) {
-				struct tty_struct *tty;
-				tty = (struct tty_struct *)curfp->private_data;
-				if (tty && tty->magic == TTY_MAGIC) {
-					tty = tty_pair_get_tty(tty);
-					current_dev = tty_devnum(tty);
-				}
-			}
-			dput(curfp->f_dentry);
-		}
-		spin_unlock(&curfiles->file_lock);
-		put_files_struct(curfiles);
-	}
-	return seq_open(file, &tty_consoles_op);
-}
-
-static const struct file_operations proc_tty_consoles_operations = {
-	.open		= tty_consoles_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * This function is called by tty_register_driver() to handle
  * registering the driver's /proc handler into /proc/tty/driver/<foo>
@@ -343,5 +186,4 @@ void __init proc_tty_init(void)
 	proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
 	proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
 	proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
-	proc_create("tty/consoles", 0, NULL, &proc_tty_consoles_operations);
 }
-- 
cgit v1.2.3


From 898f635c4297e91ceac675d83c4a460f26118342 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Oct 2010 11:24:25 -0400
Subject: NFSv4: Don't ignore the error return codes from nfs_intent_set_file

If nfs_intent_set_file() returns an error, we usually want to pass that
back up the stack.

Also ensure that nfs_open_revalidate() returns '1' on success.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fe549f5ef20f..a847acf4d3cd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1086,6 +1086,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	struct dentry *res = NULL;
 	struct inode *inode;
 	int open_flags;
+	int err;
 
 	dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1119,9 +1120,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		if (!IS_POSIXACL(dir))
 			attr.ia_mode &= ~current_umask();
 	} else {
-		open_flags &= ~O_EXCL;
+		open_flags &= ~(O_EXCL | O_CREAT);
 		attr.ia_valid = 0;
-		BUG_ON(open_flags & O_CREAT);
 	}
 
 	/* Open the file on the server */
@@ -1150,13 +1150,18 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		}
 	}
 	res = d_add_unique(dentry, inode);
+	nfs_unblock_sillyrename(dentry->d_parent);
 	if (res != NULL) {
 		dput(ctx->path.dentry);
 		ctx->path.dentry = dget(res);
 		dentry = res;
 	}
-	nfs_intent_set_file(nd, ctx);
-	nfs_unblock_sillyrename(dentry->d_parent);
+	err = nfs_intent_set_file(nd, ctx);
+	if (err < 0) {
+		if (res != NULL)
+			dput(res);
+		return ERR_PTR(err);
+	}
 out:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return res;
@@ -1221,11 +1226,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 		}
 	}
 	iput(inode);
-	if (inode == dentry->d_inode) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs_intent_set_file(nd, ctx);
-	} else
+	if (inode != dentry->d_inode)
 		goto out_drop;
+
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	ret = nfs_intent_set_file(nd, ctx);
+	if (ret >= 0)
+		ret = 1;
 out:
 	dput(parent);
 	return ret;
@@ -1262,20 +1269,24 @@ static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
 		ctx = nameidata_to_nfs_open_context(dentry, nd);
 		error = PTR_ERR(ctx);
 		if (IS_ERR(ctx))
-			goto out_err;
+			goto out_err_drop;
 	}
 
 	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
 	if (error != 0)
 		goto out_put_ctx;
-	if (ctx != NULL)
-		nfs_intent_set_file(nd, ctx);
+	if (ctx != NULL) {
+		error = nfs_intent_set_file(nd, ctx);
+		if (error < 0)
+			goto out_err;
+	}
 	return 0;
 out_put_ctx:
 	if (ctx != NULL)
 		put_nfs_open_context(ctx);
-out_err:
+out_err_drop:
 	d_drop(dentry);
+out_err:
 	return error;
 }
 
-- 
cgit v1.2.3


From 168667c43bbafff11b46014a1e94477ff7619f45 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 19 Oct 2010 19:47:49 -0400
Subject: NFSv4: The state manager must ignore EKEYEXPIRED.

Otherwise, we cannot recover state correctly.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c  | 27 ++++++++++++++++++---------
 fs/nfs/nfs4state.c | 23 ++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 88a9d93ec1bd..aa771c1af115 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1186,7 +1186,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
-		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -1256,6 +1256,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
 			case -ENOMEM:
 				err = 0;
 				goto out;
@@ -1603,7 +1610,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
-		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -3544,7 +3550,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 			case -NFS4ERR_RESOURCE:
 				/* The IBM lawyers misread another document! */
 			case -NFS4ERR_DELAY:
-			case -EKEYEXPIRED:
 				err = nfs4_delay(clp->cl_rpcclient, &timeout);
 		}
 	} while (err == 0);
@@ -4156,7 +4161,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -4181,7 +4186,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
-		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -4327,13 +4331,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
 				err = 0;
 				goto out;
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
+				err = 0;
+				goto out;
 			case -ENOMEM:
 			case -NFS4ERR_DENIED:
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				err = 0;
 				goto out;
 			case -NFS4ERR_DELAY:
-			case -EKEYEXPIRED:
 				break;
 		}
 		err = nfs4_handle_exception(server, err, &exception);
@@ -4562,7 +4574,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
-	case -EKEYEXPIRED:
 		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
 		rpc_delay(task, NFS4_POLL_RETRY_MIN);
 		task->tk_status = 0;
@@ -5025,7 +5036,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
 {
 	switch(task->tk_status) {
 	case -NFS4ERR_DELAY:
-	case -EKEYEXPIRED:
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
@@ -5167,7 +5177,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
 	case -NFS4ERR_WRONG_CRED: /* What to do here? */
 		break;
 	case -NFS4ERR_DELAY:
-	case -EKEYEXPIRED:
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 940cf7c070af..40028ac2b69c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1063,6 +1063,14 @@ restart:
 				/* Mark the file as being 'closed' */
 				state->state = 0;
 				break;
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
+				break;
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_BAD_STATEID:
@@ -1181,6 +1189,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
+static void nfs4_warn_keyexpired(const char *s)
+{
+	printk_ratelimited(KERN_WARNING "Error: state manager"
+			" encountered RPCSEC_GSS session"
+			" expired against NFSv4 server %s.\n",
+			s);
+}
+
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
 	switch (error) {
@@ -1210,6 +1226,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 			/* Zero session reset errors */
 			return 0;
+		case -EKEYEXPIRED:
+			/* Nothing we can do */
+			nfs4_warn_keyexpired(clp->cl_hostname);
+			return 0;
 	}
 	return error;
 }
@@ -1420,9 +1440,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_CLID_INUSE:
 		case -EAGAIN:
-		case -EKEYEXPIRED:
 			break;
 
+		case -EKEYEXPIRED:
+			nfs4_warn_keyexpired(clp->cl_hostname);
 		case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
 					 * in nfs4_exchange_id */
 		default:
-- 
cgit v1.2.3


From 8c7597f6ce212bbc8ca05090e21820ffe9792b3d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 22 Oct 2010 16:18:52 -0700
Subject: nfs: include ratelimit.h, fix nfs4state build error

nfs4state.c uses interfaces from ratelimit.h.  It needs to include
that header file to fix build errors:

fs/nfs/nfs4state.c:1195: warning: type defaults to 'int' in declaration of 'DEFINE_RATELIMIT_STATE'
fs/nfs/nfs4state.c:1195: warning: parameter names (without types) in function declaration
fs/nfs/nfs4state.c:1195: error: invalid storage class for function 'DEFINE_RATELIMIT_STATE'
fs/nfs/nfs4state.c:1195: error: implicit declaration of function '__ratelimit'
fs/nfs/nfs4state.c:1195: error: '_rs' undeclared (first use in this function)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc:	Trond Myklebust <Trond.Myklebust@netapp.com>
Cc:	linux-nfs@vger.kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 40028ac2b69c..e0dc06d74b6a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -46,6 +46,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
-- 
cgit v1.2.3


From d1bacf9eb2fd0e7ef870acf84b9e3b157dcfa7dc Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 24 Sep 2010 14:48:42 -0400
Subject: NFS: add readdir cache array

This patch adds the readdir cache array and functions to retreive the array
stored on a cache page, clear the array by freeing allocated memory, add an
entry to the array, and search the array for a given cookie.

It then modifies readdir to make use of the new cache array.
With the new cache array method, we no longer need some of this code.

Finally, nfs_llseek_dir() will set file->f_pos to a value greater than 0 and
desc->dir_cookie to zero.  When we see this, readdir needs to find the file
at position file->f_pos from the start of the directory.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 632 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 309 insertions(+), 323 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a847acf4d3cd..f77243907a08 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -55,6 +55,7 @@ static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static int nfs_readdir_clear_array(struct page*, gfp_t);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -80,6 +81,10 @@ const struct inode_operations nfs_dir_inode_operations = {
 	.setattr	= nfs_setattr,
 };
 
+const struct address_space_operations nfs_dir_addr_space_ops = {
+	.releasepage = nfs_readdir_clear_array,
+};
+
 #ifdef CONFIG_NFS_V3
 const struct inode_operations nfs3_dir_inode_operations = {
 	.create		= nfs_create,
@@ -151,51 +156,188 @@ nfs_opendir(struct inode *inode, struct file *filp)
 	return res;
 }
 
+struct nfs_cache_array_entry {
+	u64 cookie;
+	u64 ino;
+	struct qstr string;
+};
+
+struct nfs_cache_array {
+	unsigned int size;
+	int eof_index;
+	u64 last_cookie;
+	struct nfs_cache_array_entry array[0];
+};
+
+#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+
 typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
 	unsigned long	page_index;
-	__be32		*ptr;
 	u64		*dir_cookie;
 	loff_t		current_index;
-	struct nfs_entry *entry;
 	decode_dirent_t	decode;
-	int		plus;
+
 	unsigned long	timestamp;
 	unsigned long	gencount;
-	int		timestamp_valid;
+	unsigned int	cache_entry_index;
+	unsigned int	plus:1;
+	unsigned int	eof:1;
 } nfs_readdir_descriptor_t;
 
-/* Now we cache directories properly, by stuffing the dirent
- * data directly in the page cache.
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- *	 page-in of the RPC reply, nowhere else, this simplies
- *	 things substantially.
+/*
+ * The caller is responsible for calling nfs_readdir_release_array(page)
  */
 static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+	if (page == NULL)
+		return ERR_PTR(-EIO);
+	return (struct nfs_cache_array *)kmap(page);
+}
+
+static
+void nfs_readdir_release_array(struct page *page)
+{
+	kunmap(page);
+}
+
+/*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+static
+int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+{
+	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	int i;
+	for (i = 0; i < array->size; i++)
+		kfree(array->array[i].string.name);
+	nfs_readdir_release_array(page);
+	return 0;
+}
+
+/*
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_clear_readdir_array()
+ */
+static
+void nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+	string->len = len;
+	string->name = kmemdup(name, len, GFP_KERNEL);
+	string->hash = full_name_hash(string->name, string->len);
+}
+
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	if (IS_ERR(array))
+		return PTR_ERR(array);
+	if (array->size >= MAX_READDIR_ARRAY) {
+		nfs_readdir_release_array(page);
+		return -EIO;
+	}
+
+	array->array[array->size].cookie = entry->prev_cookie;
+	array->last_cookie = entry->cookie;
+	array->array[array->size].ino = entry->ino;
+	nfs_readdir_make_qstr(&array->array[array->size].string, entry->name, entry->len);
+	if (entry->eof == 1)
+		array->eof_index = array->size;
+	array->size++;
+	nfs_readdir_release_array(page);
+	return 0;
+}
+
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	loff_t diff = desc->file->f_pos - desc->current_index;
+	unsigned int index;
+
+	if (diff < 0)
+		goto out_eof;
+	if (diff >= array->size) {
+		if (array->eof_index > 0)
+			goto out_eof;
+		desc->current_index += array->size;
+		return -EAGAIN;
+	}
+
+	index = (unsigned int)diff;
+	*desc->dir_cookie = array->array[index].cookie;
+	desc->cache_entry_index = index;
+	if (index == array->eof_index)
+		desc->eof = 1;
+	return 0;
+out_eof:
+	desc->eof = 1;
+	return -EBADCOOKIE;
+}
+
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	int i;
+	int status = -EAGAIN;
+
+	for (i = 0; i < array->size; i++) {
+		if (i == array->eof_index) {
+			desc->eof = 1;
+			status = -EBADCOOKIE;
+		}
+		if (array->array[i].cookie == *desc->dir_cookie) {
+			desc->cache_entry_index = i;
+			status = 0;
+			break;
+		}
+	}
+
+	return status;
+}
+
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+	struct nfs_cache_array *array;
+	int status = -EBADCOOKIE;
+
+	if (desc->dir_cookie == NULL)
+		goto out;
+
+	array = nfs_readdir_get_array(desc->page);
+	if (IS_ERR(array)) {
+		status = PTR_ERR(array);
+		goto out;
+	}
+
+	if (*desc->dir_cookie == 0)
+		status = nfs_readdir_search_for_pos(array, desc);
+	else
+		status = nfs_readdir_search_for_cookie(array, desc);
+
+	nfs_readdir_release_array(desc->page);
+out:
+	return status;
+}
+
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc,
+			struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
-	struct file	*file = desc->file;
-	struct inode	*inode = file->f_path.dentry->d_inode;
 	struct rpc_cred	*cred = nfs_file_cred(file);
 	unsigned long	timestamp, gencount;
 	int		error;
 
-	dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
-			__func__, (long long)desc->entry->cookie,
-			page->index);
-
  again:
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
-	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, xdr_page,
 					  NFS_SERVER(inode)->dtsize, desc->plus);
 	if (error < 0) {
 		/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -209,190 +351,157 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	}
 	desc->timestamp = timestamp;
 	desc->gencount = gencount;
-	desc->timestamp_valid = 1;
-	SetPageUptodate(page);
-	/* Ensure consistent page alignment of the data.
-	 * Note: assumes we have exclusive access to this mapping either
-	 *	 through inode->i_mutex or some other mechanism.
-	 */
-	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-		/* Should never happen */
-		nfs_zap_mapping(inode, inode->i_mapping);
-	}
-	unlock_page(page);
-	return 0;
- error:
-	unlock_page(page);
-	return -EIO;
+error:
+	return error;
 }
 
-static inline
-int dir_decode(nfs_readdir_descriptor_t *desc)
+/* Fill in an entry based on the xdr code stored in desc->page */
+static
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 **ptr)
 {
-	__be32	*p = desc->ptr;
-	p = desc->decode(p, desc->entry, desc->plus);
+	__be32	*p = *ptr;
+	p = desc->decode(p, entry, desc->plus);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
-	desc->ptr = p;
-	if (desc->timestamp_valid) {
-		desc->entry->fattr->time_start = desc->timestamp;
-		desc->entry->fattr->gencount = desc->gencount;
-	} else
-		desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
-	return 0;
-}
+	*ptr = p;
 
-static inline
-void dir_page_release(nfs_readdir_descriptor_t *desc)
-{
-	kunmap(desc->page);
-	page_cache_release(desc->page);
-	desc->page = NULL;
-	desc->ptr = NULL;
+	entry->fattr->time_start = desc->timestamp;
+	entry->fattr->gencount = desc->gencount;
+	return 0;
 }
 
-/*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent(nfs_readdir_descriptor_t *desc)
+/* Perform conversion from xdr to cache array */
+static
+void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+				struct page *xdr_page, struct page *page)
 {
-	struct nfs_entry *entry = desc->entry;
-	int		loop_count = 0,
-			status;
-
-	while((status = dir_decode(desc)) == 0) {
-		dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
-				__func__, (unsigned long long)entry->cookie);
-		if (entry->prev_cookie == *desc->dir_cookie)
+	__be32 *ptr = kmap(xdr_page);
+	while (xdr_decode(desc, entry, &ptr) == 0) {
+		if (nfs_readdir_add_to_array(entry, page) == -1)
 			break;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
 	}
-	return status;
+	kunmap(xdr_page);
 }
 
-/*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the entry at offset 'desc->file->f_pos'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent_index(nfs_readdir_descriptor_t *desc)
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
-	struct nfs_entry *entry = desc->entry;
-	int		loop_count = 0,
-			status;
-
-	for(;;) {
-		status = dir_decode(desc);
-		if (status)
-			break;
+	struct page *xdr_page;
+	struct nfs_entry entry;
+	struct file	*file = desc->file;
+	struct nfs_cache_array *array;
+	int status = 0;
+
+	entry.prev_cookie = 0;
+	entry.cookie = *desc->dir_cookie;
+	entry.eof = 0;
+	entry.fh = nfs_alloc_fhandle();
+	entry.fattr = nfs_alloc_fattr();
+	if (entry.fh == NULL || entry.fattr == NULL)
+		goto out;
 
-		dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
-				(unsigned long long)entry->cookie, desc->current_index);
+	array = nfs_readdir_get_array(page);
+	memset(array, 0, sizeof(struct nfs_cache_array));
+	array->eof_index = -1;
 
-		if (desc->file->f_pos == desc->current_index) {
-			*desc->dir_cookie = entry->cookie;
+	xdr_page = alloc_page(GFP_KERNEL);
+	if (!xdr_page)
+		goto out_release_array;
+	do {
+		status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode);
+		if (status < 0)
 			break;
-		}
-		desc->current_index++;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
-	}
+		nfs_readdir_page_filler(desc, &entry, xdr_page, page);
+	} while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+
+	put_page(xdr_page);
+out_release_array:
+	nfs_readdir_release_array(page);
+out:
+	nfs_free_fattr(entry.fattr);
+	nfs_free_fhandle(entry.fh);
 	return status;
 }
 
 /*
- * Find the given page, and call find_dirent() or find_dirent_index in
- * order to try to return the next entry.
+ * Now we cache directories properly, by converting xdr information
+ * to an array that can be used for lookups later.  This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
  */
-static inline
-int find_dirent_page(nfs_readdir_descriptor_t *desc)
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
 	struct inode	*inode = desc->file->f_path.dentry->d_inode;
-	struct page	*page;
-	int		status;
 
-	dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
-			__func__, desc->page_index,
-			(long long) *desc->dir_cookie);
+	if (nfs_readdir_xdr_to_array(desc, page, inode) == -1)
+		goto error;
+	SetPageUptodate(page);
 
-	/* If we find the page in the page_cache, we cannot be sure
-	 * how fresh the data is, so we will ignore readdir_plus attributes.
-	 */
-	desc->timestamp_valid = 0;
-	page = read_cache_page(inode->i_mapping, desc->page_index,
-			       (filler_t *)nfs_readdir_filler, desc);
-	if (IS_ERR(page)) {
-		status = PTR_ERR(page);
-		goto out;
+	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+		/* Should never happen */
+		nfs_zap_mapping(inode, inode->i_mapping);
 	}
+	unlock_page(page);
+	return 0;
+ error:
+	unlock_page(page);
+	return -EIO;
+}
 
-	/* NOTE: Someone else may have changed the READDIRPLUS flag */
-	desc->page = page;
-	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	if (*desc->dir_cookie != 0)
-		status = find_dirent(desc);
-	else
-		status = find_dirent_index(desc);
-	if (status < 0)
-		dir_page_release(desc);
- out:
-	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
-	return status;
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+	page_cache_release(desc->page);
+	desc->page = NULL;
+}
+
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	struct page *page;
+	page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+			desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+	if (IS_ERR(page))
+		desc->eof = 1;
+	return page;
 }
 
 /*
- * Recurse through the page cache pages, and return a
- * filled nfs_entry structure of the next directory entry if possible.
- *
- * The target for the search is '*desc->dir_cookie' if non-0,
- * 'desc->file->f_pos' otherwise
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
  */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	int res;
+
+	desc->page = get_cache_page(desc);
+	if (IS_ERR(desc->page))
+		return PTR_ERR(desc->page);
+
+	res = nfs_readdir_search_array(desc);
+	if (res == 0)
+		return 0;
+	cache_page_release(desc);
+	return res;
+}
+
+/* Search for desc->dir_cookie from the beginning of the page cache */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-	int		loop_count = 0;
-	int		res;
-
-	/* Always search-by-index from the beginning of the cache */
-	if (*desc->dir_cookie == 0) {
-		dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
-				(long long)desc->file->f_pos);
-		desc->page_index = 0;
-		desc->entry->cookie = desc->entry->prev_cookie = 0;
-		desc->entry->eof = 0;
-		desc->current_index = 0;
-	} else
-		dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
-				(unsigned long long)*desc->dir_cookie);
+	int res = -EAGAIN;
+	desc->page_index = 0;
 
-	for (;;) {
-		res = find_dirent_page(desc);
+	if (*desc->dir_cookie == 0)
+		desc->cache_entry_index = 0;
+
+	while (1) {
+		res = find_cache_page(desc);
 		if (res != -EAGAIN)
 			break;
-		/* Align to beginning of next page */
-		desc->page_index ++;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
+		desc->page_index++;
 	}
-
-	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
 	return res;
 }
 
@@ -401,8 +510,6 @@ static inline unsigned int dt_type(struct inode *inode)
 	return (inode->i_mode >> 12) & 15;
 }
 
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
-
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
@@ -411,49 +518,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		   filldir_t filldir)
 {
 	struct file	*file = desc->file;
-	struct nfs_entry *entry = desc->entry;
-	struct dentry	*dentry = NULL;
-	u64		fileid;
-	int		loop_count = 0,
-			res;
-
-	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
-			(unsigned long long)entry->cookie);
-
-	for(;;) {
-		unsigned d_type = DT_UNKNOWN;
-		/* Note: entry->prev_cookie contains the cookie for
-		 *	 retrieving the current dirent on the server */
-		fileid = entry->ino;
-
-		/* Get a dentry if we have one */
-		if (dentry != NULL)
-			dput(dentry);
-		dentry = nfs_readdir_lookup(desc);
-
-		/* Use readdirplus info */
-		if (dentry != NULL && dentry->d_inode != NULL) {
-			d_type = dt_type(dentry->d_inode);
-			fileid = NFS_FILEID(dentry->d_inode);
-		}
+	int i = 0;
+	int res = 0;
+	struct nfs_cache_array *array = NULL;
+	unsigned int d_type = DT_UNKNOWN;
+	struct dentry *dentry = NULL;
 
-		res = filldir(dirent, entry->name, entry->len, 
-			      file->f_pos, nfs_compat_user_ino64(fileid),
-			      d_type);
+	array = nfs_readdir_get_array(desc->page);
+
+	for (i = desc->cache_entry_index; i < array->size; i++) {
+		d_type = DT_UNKNOWN;
+
+		res = filldir(dirent, array->array[i].string.name,
+			array->array[i].string.len, file->f_pos,
+			nfs_compat_user_ino64(array->array[i].ino), d_type);
 		if (res < 0)
 			break;
 		file->f_pos++;
-		*desc->dir_cookie = entry->cookie;
-		if (dir_decode(desc) != 0) {
-			desc->page_index ++;
+		desc->cache_entry_index = i;
+		if (i < (array->size-1))
+			*desc->dir_cookie = array->array[i+1].cookie;
+		else
+			*desc->dir_cookie = array->last_cookie;
+		if (i == array->eof_index) {
+			desc->eof = 1;
 			break;
 		}
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
 	}
-	dir_page_release(desc);
+
+	nfs_readdir_release_array(desc->page);
+	cache_page_release(desc);
 	if (dentry != NULL)
 		dput(dentry);
 	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -477,12 +571,9 @@ static inline
 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		     filldir_t filldir)
 {
-	struct file	*file = desc->file;
-	struct inode	*inode = file->f_path.dentry->d_inode;
-	struct rpc_cred	*cred = nfs_file_cred(file);
 	struct page	*page = NULL;
 	int		status;
-	unsigned long	timestamp, gencount;
+	struct inode *inode = desc->file->f_path.dentry->d_inode;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -492,38 +583,21 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		status = -ENOMEM;
 		goto out;
 	}
-	timestamp = jiffies;
-	gencount = nfs_inc_attr_generation_counter();
-	status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
-						*desc->dir_cookie, page,
-						NFS_SERVER(inode)->dtsize,
-						desc->plus);
-	desc->page = page;
-	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	if (status >= 0) {
-		desc->timestamp = timestamp;
-		desc->gencount = gencount;
-		desc->timestamp_valid = 1;
-		if ((status = dir_decode(desc)) == 0)
-			desc->entry->prev_cookie = *desc->dir_cookie;
-	} else
+
+	if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
 		status = -EIO;
-	if (status < 0)
 		goto out_release;
+	}
 
+	desc->page = page;
 	status = nfs_do_filldir(desc, dirent, filldir);
 
-	/* Reset read descriptor so it searches the page cache from
-	 * the start upon the next call to readdir_search_pagecache() */
-	desc->page_index = 0;
-	desc->entry->cookie = desc->entry->prev_cookie = 0;
-	desc->entry->eof = 0;
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
 			__func__, status);
 	return status;
  out_release:
-	dir_page_release(desc);
+	cache_page_release(desc);
 	goto out;
 }
 
@@ -537,7 +611,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct inode	*inode = dentry->d_inode;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
-	struct nfs_entry my_entry;
 	int res = -ENOMEM;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -558,26 +631,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
 
-	my_entry.cookie = my_entry.prev_cookie = 0;
-	my_entry.eof = 0;
-	my_entry.fh = nfs_alloc_fhandle();
-	my_entry.fattr = nfs_alloc_fattr();
-	if (my_entry.fh == NULL || my_entry.fattr == NULL)
-		goto out_alloc_failed;
-
-	desc->entry = &my_entry;
-
 	nfs_block_sillyrename(dentry);
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0)
 		goto out;
 
-	while(!desc->entry->eof) {
+	while (desc->eof != 1) {
 		res = readdir_search_pagecache(desc);
 
 		if (res == -EBADCOOKIE) {
 			/* This means either end of directory */
-			if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+			if (*desc->dir_cookie && desc->eof == 0) {
 				/* Or that the server has 'lost' a cookie */
 				res = uncached_readdir(desc, dirent, filldir);
 				if (res >= 0)
@@ -590,7 +654,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			nfs_zap_caches(inode);
 			desc->plus = 0;
-			desc->entry->eof = 0;
+			desc->eof = 0;
 			continue;
 		}
 		if (res < 0)
@@ -606,9 +670,6 @@ out:
 	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
 		res = 0;
-out_alloc_failed:
-	nfs_free_fattr(my_entry.fattr);
-	nfs_free_fhandle(my_entry.fh);
 	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			res);
@@ -1292,81 +1353,6 @@ out_err:
 
 #endif /* CONFIG_NFSV4 */
 
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
-{
-	struct dentry *parent = desc->file->f_path.dentry;
-	struct inode *dir = parent->d_inode;
-	struct nfs_entry *entry = desc->entry;
-	struct dentry *dentry, *alias;
-	struct qstr name = {
-		.name = entry->name,
-		.len = entry->len,
-	};
-	struct inode *inode;
-	unsigned long verf = nfs_save_change_attribute(dir);
-
-	switch (name.len) {
-		case 2:
-			if (name.name[0] == '.' && name.name[1] == '.')
-				return dget_parent(parent);
-			break;
-		case 1:
-			if (name.name[0] == '.')
-				return dget(parent);
-	}
-
-	spin_lock(&dir->i_lock);
-	if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
-		spin_unlock(&dir->i_lock);
-		return NULL;
-	}
-	spin_unlock(&dir->i_lock);
-
-	name.hash = full_name_hash(name.name, name.len);
-	dentry = d_lookup(parent, &name);
-	if (dentry != NULL) {
-		/* Is this a positive dentry that matches the readdir info? */
-		if (dentry->d_inode != NULL &&
-				(NFS_FILEID(dentry->d_inode) == entry->ino ||
-				d_mountpoint(dentry))) {
-			if (!desc->plus || entry->fh->size == 0)
-				return dentry;
-			if (nfs_compare_fh(NFS_FH(dentry->d_inode),
-						entry->fh) == 0)
-				goto out_renew;
-		}
-		/* No, so d_drop to allow one to be created */
-		d_drop(dentry);
-		dput(dentry);
-	}
-	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
-		return NULL;
-	if (name.len > NFS_SERVER(dir)->namelen)
-		return NULL;
-	/* Note: caller is already holding the dir->i_mutex! */
-	dentry = d_alloc(parent, &name);
-	if (dentry == NULL)
-		return NULL;
-	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
-	if (IS_ERR(inode)) {
-		dput(dentry);
-		return NULL;
-	}
-
-	alias = d_materialise_unique(dentry, inode);
-	if (alias != NULL) {
-		dput(dentry);
-		if (IS_ERR(alias))
-			return NULL;
-		dentry = alias;
-	}
-
-out_renew:
-	nfs_set_verifier(dentry, verf);
-	return dentry;
-}
-
 /*
  * Code common to create, mkdir, and mknod.
  */
-- 
cgit v1.2.3


From baf57a09e9d87b14be5e2788828169394a2525ab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 24 Sep 2010 18:49:43 -0400
Subject: NFS: Optimise the readdir searches

If we're going through the loop in nfs_readdir() more than once, we usually
do not want to restart searching from the beginning of the pages cache.

We only want to do that if the previous search failed...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f77243907a08..33b0ce7a97be 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -491,10 +491,6 @@ static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
 	int res = -EAGAIN;
-	desc->page_index = 0;
-
-	if (*desc->dir_cookie == 0)
-		desc->cache_entry_index = 0;
 
 	while (1) {
 		res = find_cache_page(desc);
@@ -589,6 +585,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		goto out_release;
 	}
 
+	desc->page_index = 0;
 	desc->page = page;
 	status = nfs_do_filldir(desc, dirent, filldir);
 
@@ -653,6 +650,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (res == -ETOOSMALL && desc->plus) {
 			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			nfs_zap_caches(inode);
+			desc->page_index = 0;
 			desc->plus = 0;
 			desc->eof = 0;
 			continue;
-- 
cgit v1.2.3


From d39ab9de3b80da5835049b1c3b49da4e84e01c07 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 24 Sep 2010 18:50:01 -0400
Subject: NFS: re-add readdir plus

This patch adds readdir plus support to the cache array.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 33b0ce7a97be..fd30f185ec01 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -370,6 +370,71 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 *
 	return 0;
 }
 
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
+{
+	struct nfs_inode *node;
+	if (dentry->d_inode == NULL)
+		goto different;
+	node = NFS_I(dentry->d_inode);
+	if (node->fh.size != entry->fh->size)
+		goto different;
+	if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+		goto different;
+	return 1;
+different:
+	return 0;
+}
+
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
+{
+	struct qstr filename;
+	struct dentry *dentry = NULL;
+	struct dentry *alias = NULL;
+	struct inode *dir = parent->d_inode;
+	struct inode *inode;
+
+	nfs_readdir_make_qstr(&filename, entry->name, entry->len);
+	if (filename.len == 1 && filename.name[0] == '.')
+		dentry = dget(parent);
+	else if (filename.len == 2 && filename.name[0] == '.'
+				   && filename.name[1] == '.')
+		dentry = dget_parent(parent);
+	else
+		dentry = d_lookup(parent, &filename);
+
+	if (dentry != NULL) {
+		if (nfs_same_file(dentry, entry)) {
+			nfs_refresh_inode(dentry->d_inode, entry->fattr);
+			goto out;
+		} else {
+			d_drop(dentry);
+			dput(dentry);
+		}
+	}
+
+	dentry = d_alloc(parent, &filename);
+	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+	if (IS_ERR(inode))
+		goto out;
+
+	alias = d_materialise_unique(dentry, inode);
+	if (IS_ERR(alias))
+		goto out;
+	else if (alias) {
+		nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+		dput(alias);
+	} else
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+out:
+	dput(dentry);
+	kfree(filename.name);
+	return;
+}
+
 /* Perform conversion from xdr to cache array */
 static
 void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
@@ -379,6 +444,8 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
 	while (xdr_decode(desc, entry, &ptr) == 0) {
 		if (nfs_readdir_add_to_array(entry, page) == -1)
 			break;
+		if (desc->plus == 1)
+			nfs_prime_dcache(desc->file->f_path.dentry, entry);
 	}
 	kunmap(xdr_page);
 }
-- 
cgit v1.2.3


From 0715dc632a271fc0fedf3ef4779fe28ac1e53ef4 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Fri, 24 Sep 2010 18:50:01 -0400
Subject: NFS: remove readdir plus limit

We will now use readdir plus even on directories that are very large.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f2d2c801e0af..6eec28656415 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -234,9 +234,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
 	return 0;
 }
 
-/* Don't use READDIRPLUS on directories that we believe are too large */
-#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
-
 /*
  * This is our front-end to iget that looks up inodes by file handle
  * instead of inode number.
@@ -291,8 +288,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		} else if (S_ISDIR(inode->i_mode)) {
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
-			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
-			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
+			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if ((fattr->valid & NFS_ATTR_FATTR_FSID)
-- 
cgit v1.2.3


From babddc72a9468884ce1a23db3c3d54b0afa299f0 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 20 Oct 2010 15:44:29 -0400
Subject: NFS: decode_dirent should use an xdr_stream

Convert nfs*xdr.c to use an xdr stream in decode_dirent.  This will prevent a
kernel oops that has been occuring when reading a vmapped page.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            | 29 ++++++++++-----
 fs/nfs/internal.h       |  6 ++--
 fs/nfs/nfs2xdr.c        | 39 ++++++++++++++++++---
 fs/nfs/nfs3xdr.c        | 93 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/nfs/nfs4_fs.h        |  2 +-
 fs/nfs/nfs4xdr.c        | 71 ++++++++++++++++++++++++++++---------
 include/linux/nfs_xdr.h |  2 +-
 7 files changed, 202 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fd30f185ec01..88cbcda76856 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -171,7 +171,7 @@ struct nfs_cache_array {
 
 #define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
 
-typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
@@ -357,13 +357,11 @@ error:
 
 /* Fill in an entry based on the xdr code stored in desc->page */
 static
-int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, __be32 **ptr)
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-	__be32	*p = *ptr;
-	p = desc->decode(p, entry, desc->plus);
+	__be32 *p = desc->decode(stream, entry, desc->plus);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
-	*ptr = p;
 
 	entry->fattr->time_start = desc->timestamp;
 	entry->fattr->gencount = desc->gencount;
@@ -438,10 +436,23 @@ out:
 /* Perform conversion from xdr to cache array */
 static
 void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-				struct page *xdr_page, struct page *page)
+				struct page *xdr_page, struct page *page, unsigned int buflen)
 {
+	struct xdr_stream stream;
+	struct xdr_buf buf;
 	__be32 *ptr = kmap(xdr_page);
-	while (xdr_decode(desc, entry, &ptr) == 0) {
+
+	buf.head->iov_base = xdr_page;
+	buf.head->iov_len = buflen;
+	buf.tail->iov_len = 0;
+	buf.page_base = 0;
+	buf.page_len = 0;
+	buf.buflen = buf.head->iov_len;
+	buf.len = buf.head->iov_len;
+
+	xdr_init_decode(&stream, &buf, ptr);
+
+	while (xdr_decode(desc, entry, &stream) == 0) {
 		if (nfs_readdir_add_to_array(entry, page) == -1)
 			break;
 		if (desc->plus == 1)
@@ -458,6 +469,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	struct file	*file = desc->file;
 	struct nfs_cache_array *array;
 	int status = 0;
+	unsigned int array_size = 1;
 
 	entry.prev_cookie = 0;
 	entry.cookie = *desc->dir_cookie;
@@ -476,9 +488,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		goto out_release_array;
 	do {
 		status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode);
+
 		if (status < 0)
 			break;
-		nfs_readdir_page_filler(desc, &entry, xdr_page, page);
+		nfs_readdir_page_filler(desc, &entry, xdr_page, page, array_size * PAGE_SIZE);
 	} while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
 
 	put_page(xdr_page);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..74b015598a43 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -181,15 +181,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, int);
 
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, int);
 
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 79c74387a2fe..0210c752e743 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -500,25 +500,56 @@ err_unmap:
 	goto out;
 }
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 __be32 *
-nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
 {
-	if (!*p++) {
-		if (!*p)
+	__be32 *p;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
 	entry->ino	  = ntohl(*p++);
 	entry->len	  = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name	  = (const char *) p;
 	p		 += XDR_QUADLEN(entry->len);
 	entry->prev_cookie	  = entry->cookie;
 	entry->cookie	  = ntohl(*p++);
-	entry->eof	  = !p[0] && p[1];
+
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
 
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
 }
 
 /*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 52b2fda66e63..d562c8d9d56e 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 /*
  * Common NFS XDR functions as inlines
  */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
 	return NULL;
 }
 
+static inline __be32 *
+xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	fh->size = ntohl(*p++);
+
+	if (fh->size <= NFS3_FHSIZE) {
+		p = xdr_inline_decode(xdr, fh->size);
+		if (unlikely(!p))
+			goto out_overflow;
+		memcpy(fh->data, p, fh->size);
+		return p + XDR_QUADLEN(fh->size);
+	}
+	return NULL;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
+}
+
 /*
  * Encode/decode time.
  */
@@ -240,6 +270,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
+static inline __be32 *
+xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 84);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_fattr(p, fattr);
+	}
+	return p;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
+}
+
 static inline __be32 *
 xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
@@ -616,19 +666,33 @@ err_unmap:
 }
 
 __be32 *
-nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
 {
+	__be32 *p;
 	struct nfs_entry old = *entry;
 
-	if (!*p++) {
-		if (!*p)
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &entry->ino);
 	entry->len  = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len + 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name = (const char *) p;
 	p += XDR_QUADLEN(entry->len);
 	entry->prev_cookie = entry->cookie;
@@ -636,10 +700,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 
 	if (plus) {
 		entry->fattr->valid = 0;
-		p = xdr_decode_post_op_attr(p, entry->fattr);
+		p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
+		if (IS_ERR(p))
+			goto out_overflow_exit;
 		/* In fact, a post_op_fh3: */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (*p++) {
-			p = xdr_decode_fhandle(p, entry->fh);
+			p = xdr_decode_fhandle_stream(xdr, entry->fh);
+			if (IS_ERR(p))
+				goto out_overflow_exit;
 			/* Ugh -- server reply was truncated */
 			if (p == NULL) {
 				dprintk("NFS: FH truncated\n");
@@ -650,8 +721,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 			memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
 	}
 
-	entry->eof = !p[0] && p[1];
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
+
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+out_overflow_exit:
+	return ERR_PTR(-EIO);
 }
 
 /*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index d24a8e07b5e2..c58ea6377506 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -331,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6ea5c9392fe4..a4919e999354 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3950,13 +3950,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	p = xdr_inline_decode(xdr, 32);
+	p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
 	if (unlikely(!p))
 		goto out_overflow;
-	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
 	p = xdr_decode_hyper(p, &length);
-	type = be32_to_cpup(p++);
-	if (fl != NULL) {
+	type = be32_to_cpup(p++); /* 4 byte read */
+	if (fl != NULL) { /* manipulate file lock */
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
 		if (length == ~(uint64_t)0)
@@ -3966,9 +3966,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 			fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	p = xdr_decode_hyper(p, &clientid);
-	namelen = be32_to_cpup(p);
-	p = xdr_inline_decode(xdr, namelen);
+	p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+	namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
+	p = xdr_inline_decode(xdr, namelen); /* variable size field */
 	if (likely(p))
 		return -NFS4ERR_DENIED;
 out_overflow:
@@ -5755,21 +5755,33 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
-
-	if (!*p++) {
-		if (!*p)
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->prev_cookie = entry->cookie;
 	p = xdr_decode_hyper(p, &entry->cookie);
 	entry->len = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name = (const char *) p;
 	p += XDR_QUADLEN(entry->len);
 
@@ -5782,29 +5794,54 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 
 	len = ntohl(*p++);		/* bitmap length */
 	if (len-- > 0) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		bitmap[0] = ntohl(*p++);
 		if (len-- > 0) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
 			bitmap[1] = ntohl(*p++);
 			p += len;
 		}
 	}
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = XDR_QUADLEN(ntohl(*p++));	/* attribute buffer length */
 	if (len > 0) {
 		if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
 			bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
 			/* Ignore the return value of rdattr_error for now */
-			p++;
-			len--;
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
 		}
-		if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
+		if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) {
+			p = xdr_inline_decode(xdr, 8);
+			if (unlikely(!p))
+				goto out_overflow;
 			xdr_decode_hyper(p, &entry->ino);
-		else if (bitmap[0] == FATTR4_WORD0_FILEID)
+		} else if (bitmap[0] == FATTR4_WORD0_FILEID) {
+			p = xdr_inline_decode(xdr, 8);
+			if (unlikely(!p))
+				goto out_overflow;
 			xdr_decode_hyper(p, &entry->ino);
-		p += len;
+		}
 	}
 
-	entry->eof = !p[0] && p[1];
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
+
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
 }
 
 /*
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5772b2c2f063..ca0e8fd7feec 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1036,7 +1036,7 @@ struct nfs_rpc_ops {
 	int	(*pathconf) (struct nfs_server *, struct nfs_fh *,
 			     struct nfs_pathconf *);
 	int	(*set_capabilities)(struct nfs_server *, struct nfs_fh *);
-	__be32 *(*decode_dirent)(__be32 *, struct nfs_entry *, int plus);
+	__be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int plus);
 	void	(*read_setup)   (struct nfs_read_data *, struct rpc_message *);
 	int	(*read_done)  (struct rpc_task *, struct nfs_read_data *);
 	void	(*write_setup)  (struct nfs_write_data *, struct rpc_message *);
-- 
cgit v1.2.3


From afa8ccc978c24d8ab22e3b3b8cbd1054c84c070b Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 20 Oct 2010 15:44:31 -0400
Subject: NFS: remove page size checking code

Remove the page size checking code for a readdir decode.  This is now done
by decode_dirent with xdr_streams.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c | 54 ---------------------------------------
 fs/nfs/nfs3xdr.c | 78 +-------------------------------------------------------
 fs/nfs/nfs4xdr.c | 68 +-----------------------------------------------
 3 files changed, 2 insertions(+), 198 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 0210c752e743..82f026422424 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	struct page **page;
 	size_t hdrlen;
 	unsigned int pglen, recvd;
-	u32 len;
 	int status, nr = 0;
-	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
 		return nfs_stat_to_errno(status);
@@ -445,59 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = kmap_atomic(*page, KM_USER0);
-	end = (__be32 *)((char *)p + pglen);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		if (p + 2 > end)
-			goto short_pkt;
-		p++; /* fileid */
-		len = ntohl(*p++);
-		p += XDR_QUADLEN(len) + 1;	/* name plus cookie */
-		if (len > NFS2_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-						len);
-			goto err_unmap;
-		}
-		if (p + 2 > end)
-			goto short_pkt;
-		entry = p;
-	}
-
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
- out:
-	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
- short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	entry[0] = entry[1] = 0;
-	if (!nr)
-		nr = -errno_NFSERR_IO;
-	goto out;
-err_unmap:
-	nr = -errno_NFSERR_IO;
-	goto out;
 }
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d562c8d9d56e..dc98eb7976c3 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -554,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	struct kvec *iov = rcvbuf->head;
 	struct page **page;
 	size_t hdrlen;
-	u32 len, recvd, pglen;
+	u32 recvd, pglen;
 	int status, nr = 0;
-	__be32 *entry, *end, *kaddr;
 
 	status = ntohl(*p++);
 	/* Decode post_op_attrs */
@@ -586,83 +585,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = kmap_atomic(*page, KM_USER0);
-	end = (__be32 *)((char *)p + pglen);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		if (p + 3 > end)
-			goto short_pkt;
-		p += 2;				/* inode # */
-		len = ntohl(*p++);		/* string length */
-		p += XDR_QUADLEN(len) + 2;	/* name + cookie */
-		if (len > NFS3_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-						len);
-			goto err_unmap;
-		}
-
-		if (res->plus) {
-			/* post_op_attr */
-			if (p + 2 > end)
-				goto short_pkt;
-			if (*p++) {
-				p += 21;
-				if (p + 1 > end)
-					goto short_pkt;
-			}
-			/* post_op_fh3 */
-			if (*p++) {
-				if (p + 1 > end)
-					goto short_pkt;
-				len = ntohl(*p++);
-				if (len > NFS3_FHSIZE) {
-					dprintk("NFS: giant filehandle in "
-						"readdir (len 0x%x)!\n", len);
-					goto err_unmap;
-				}
-				p += XDR_QUADLEN(len);
-			}
-		}
-
-		if (p + 2 > end)
-			goto short_pkt;
-		entry = p;
-	}
 
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
- out:
-	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
- short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	entry[0] = entry[1] = 0;
-	if (!nr)
-		nr = -errno_NFSERR_IO;
-	goto out;
-err_unmap:
-	nr = -errno_NFSERR_IO;
-	goto out;
 }
 
 __be32 *
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a4919e999354..8346e977d837 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4200,12 +4200,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
 	struct xdr_buf	*rcvbuf = &req->rq_rcv_buf;
-	struct page	*page = *rcvbuf->pages;
 	struct kvec	*iov = rcvbuf->head;
 	size_t		hdrlen;
 	u32		recvd, pglen = rcvbuf->page_len;
-	__be32		*end, *entry, *p, *kaddr;
-	unsigned int	nr = 0;
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4222,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 		pglen = recvd;
 	xdr_read_pages(xdr, pglen);
 
-	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = kmap_atomic(page, KM_USER0);
-	end = p + ((pglen + readdir->pgbase) >> 2);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		u32 len, attrlen, xlen;
-		if (end - p < 3)
-			goto short_pkt;
-		dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-		p += 2;			/* cookie */
-		len = ntohl(*p++);	/* filename length */
-		if (len > NFS4_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-					len);
-			goto err_unmap;
-		}
-		xlen = XDR_QUADLEN(len);
-		if (end - p < xlen + 1)
-			goto short_pkt;
-		dprintk("filename = %*s\n", len, (char *)p);
-		p += xlen;
-		len = ntohl(*p++);	/* bitmap length */
-		if (end - p < len + 1)
-			goto short_pkt;
-		p += len;
-		attrlen = XDR_QUADLEN(ntohl(*p++));
-		if (end - p < attrlen + 2)
-			goto short_pkt;
-		p += attrlen;		/* attributes */
-		entry = p;
-	}
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
-out:
-	kunmap_atomic(kaddr, KM_USER0);
+
 	return 0;
-short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	dprintk("%s: short packet at entry %d\n", __func__, nr);
-	entry[0] = entry[1] = 0;
-	if (nr)
-		goto out;
-err_unmap:
-	kunmap_atomic(kaddr, KM_USER0);
-	return -errno_NFSERR_IO;
 }
 
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
-- 
cgit v1.2.3


From 56e4ebf877b6043c289bda32a5a7385b80c17dee Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 20 Oct 2010 15:44:37 -0400
Subject: NFS: readdir with vmapped pages

We can use vmapped pages to read more information from the network at once.
This will reduce the number of calls needed to complete a readdir.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
[trondmy: Added #include for linux/vmalloc.h> in fs/nfs/dir.c]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c         |  4 +--
 fs/nfs/dir.c            | 66 ++++++++++++++++++++++++++++++++++++++++---------
 fs/nfs/internal.h       |  6 +++++
 fs/nfs/nfs3proc.c       |  4 +--
 fs/nfs/nfs4proc.c       |  8 +++---
 fs/nfs/proc.c           |  4 +--
 include/linux/nfs_xdr.h |  2 +-
 7 files changed, 71 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5f01f42b3991..876ba8592706 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -903,8 +903,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE)
-		server->dtsize = PAGE_CACHE_SIZE;
+	if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
+		server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
 	if (server->dtsize > server->rsize)
 		server->dtsize = server->rsize;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 88cbcda76856..5d7da3ad4e85 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
 #include "delegation.h"
 #include "iostat.h"
@@ -327,7 +328,7 @@ out:
 
 /* Fill a page with xdr information before transferring to the cache page */
 static
-int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc,
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
 			struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
 	struct rpc_cred	*cred = nfs_file_cred(file);
@@ -337,7 +338,7 @@ int nfs_readdir_xdr_filler(struct page *xdr_page, nfs_readdir_descriptor_t *desc
  again:
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
-	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, xdr_page,
+	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
 					  NFS_SERVER(inode)->dtsize, desc->plus);
 	if (error < 0) {
 		/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -436,11 +437,11 @@ out:
 /* Perform conversion from xdr to cache array */
 static
 void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-				struct page *xdr_page, struct page *page, unsigned int buflen)
+				void *xdr_page, struct page *page, unsigned int buflen)
 {
 	struct xdr_stream stream;
 	struct xdr_buf buf;
-	__be32 *ptr = kmap(xdr_page);
+	__be32 *ptr = xdr_page;
 
 	buf.head->iov_base = xdr_page;
 	buf.head->iov_len = buflen;
@@ -458,18 +459,59 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
 		if (desc->plus == 1)
 			nfs_prime_dcache(desc->file->f_path.dentry, entry);
 	}
-	kunmap(xdr_page);
+}
+
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+	unsigned int i;
+	for (i = 0; i < npages; i++)
+		put_page(pages[i]);
+}
+
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+		unsigned int npages)
+{
+	vm_unmap_ram(ptr, npages);
+	nfs_readdir_free_pagearray(pages, npages);
+}
+
+/*
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
+ * to nfs_readdir_free_large_page
+ */
+static
+void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+{
+	void *ptr;
+	unsigned int i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+		if (page == NULL)
+			goto out_freepages;
+		pages[i] = page;
+	}
+
+	ptr = vm_map_ram(pages, NFS_MAX_READDIR_PAGES, 0, PAGE_KERNEL);
+	if (!IS_ERR_OR_NULL(ptr))
+		return ptr;
+out_freepages:
+	nfs_readdir_free_pagearray(pages, i);
+	return NULL;
 }
 
 static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
-	struct page *xdr_page;
+	struct page *pages[NFS_MAX_READDIR_PAGES];
+	void *pages_ptr = NULL;
 	struct nfs_entry entry;
 	struct file	*file = desc->file;
 	struct nfs_cache_array *array;
 	int status = 0;
-	unsigned int array_size = 1;
+	unsigned int array_size = ARRAY_SIZE(pages);
 
 	entry.prev_cookie = 0;
 	entry.cookie = *desc->dir_cookie;
@@ -483,18 +525,18 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	array->eof_index = -1;
 
-	xdr_page = alloc_page(GFP_KERNEL);
-	if (!xdr_page)
+	pages_ptr = nfs_readdir_large_page(pages, array_size);
+	if (!pages_ptr)
 		goto out_release_array;
 	do {
-		status = nfs_readdir_xdr_filler(xdr_page, desc, &entry, file, inode);
+		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
 
 		if (status < 0)
 			break;
-		nfs_readdir_page_filler(desc, &entry, xdr_page, page, array_size * PAGE_SIZE);
+		nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
 	} while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
 
-	put_page(xdr_page);
+	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
 out_release_array:
 	nfs_readdir_release_array(page);
 out:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74b015598a43..7b0e894d00c8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -62,6 +62,12 @@ struct nfs_clone_mount {
  */
 #define NFS_UNSPEC_PORT		(-1)
 
+/*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+
 /*
  * In-kernel mount arguments
  */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f8446b38dcb4..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -630,7 +630,7 @@ out:
  */
 static int
 nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		  u64 cookie, struct page *page, unsigned int count, int plus)
+		  u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	__be32			*verf = NFS_COOKIEVERF(dir);
@@ -640,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.verf		= {verf[0], verf[1]},
 		.plus		= plus,
 		.count		= count,
-		.pages		= &page
+		.pages		= pages
 	};
 	struct nfs3_readdirres	res = {
 		.verf		= verf,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aa771c1af115..cb33c73206e3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2823,12 +2823,12 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 }
 
 static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+		u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs4_readdir_arg args = {
 		.fh = NFS_FH(dir),
-		.pages = &page,
+		.pages = pages,
 		.pgbase = 0,
 		.count = count,
 		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
@@ -2859,14 +2859,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+		u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
 				_nfs4_proc_readdir(dentry, cred, cookie,
-					page, count, plus),
+					pages, count, plus),
 				&exception);
 	} while (exception.retry);
 	return err;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e5e84aa2af17..58e7f84fc1fd 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -534,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
  */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		 u64 cookie, struct page *page, unsigned int count, int plus)
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
 		.count		= count,
-		.pages		= &page,
+		.pages		= pages,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ca0e8fd7feec..1b9a17a1f235 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1026,7 +1026,7 @@ struct nfs_rpc_ops {
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, struct qstr *);
 	int	(*readdir) (struct dentry *, struct rpc_cred *,
-			    u64, struct page *, unsigned int, int);
+			    u64, struct page **, unsigned int, int);
 	int	(*mknod)   (struct inode *, struct dentry *, struct iattr *,
 			    dev_t);
 	int	(*statfs)  (struct nfs_server *, struct nfs_fh *,
-- 
cgit v1.2.3


From 3c8a1aeed8fd7f89bd0400fad72cbc1ac3460217 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 21 Oct 2010 16:33:16 -0400
Subject: NFS: nfs_readdir_filler catch all errors

Check for all errors, not a specific one.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d7da3ad4e85..7b8b7c59db9f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -556,7 +556,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
 	struct inode	*inode = desc->file->f_path.dentry->d_inode;
 
-	if (nfs_readdir_xdr_to_array(desc, page, inode) == -1)
+	if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
 		goto error;
 	SetPageUptodate(page);
 
-- 
cgit v1.2.3


From 9942438089d5c0e3adecdcb7bc360b8fe0ce7e62 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 21 Oct 2010 16:33:16 -0400
Subject: NFS: check xdr_decode for errors

Check if the decoded entry has the eof bit set when returning from xdr_decode
with an error.  If it does, we should set the eof bits in the array before
returning.  This should keep us from looping when we expect more data but the
server doesn't give us anything new.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7b8b7c59db9f..0cbb714a09d8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -442,6 +442,8 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
 	struct xdr_stream stream;
 	struct xdr_buf buf;
 	__be32 *ptr = xdr_page;
+	int status;
+	struct nfs_cache_array *array;
 
 	buf.head->iov_base = xdr_page;
 	buf.head->iov_len = buflen;
@@ -453,11 +455,23 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
 
 	xdr_init_decode(&stream, &buf, ptr);
 
-	while (xdr_decode(desc, entry, &stream) == 0) {
+
+	do {
+		status = xdr_decode(desc, entry, &stream);
+		if (status != 0)
+			break;
+
 		if (nfs_readdir_add_to_array(entry, page) == -1)
 			break;
 		if (desc->plus == 1)
 			nfs_prime_dcache(desc->file->f_path.dentry, entry);
+	} while (!entry->eof);
+
+	if (status == -EBADCOOKIE && entry->eof) {
+		array = nfs_readdir_get_array(page);
+		array->eof_index = array->size - 1;
+		status = 0;
+		nfs_readdir_release_array(page);
 	}
 }
 
-- 
cgit v1.2.3


From ae42c70a60fe330d9c2af7c4b92ce78484308e37 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 21 Oct 2010 16:33:17 -0400
Subject: NFS: introduce generic decode_getattr function

Getattr should be able to decode errors and the readdir file handle.
decode_getfattr_attrs does the actual attribute decoding, while
decode_getfattr_generic will check the opcode before decoding.  This will
let other functions call decode_getfattr_attrs to decode their attributes.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 98 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8346e977d837..b7eff205d3d8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2848,6 +2848,58 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+	__be32 *p;
+	int len;
+
+	if (fh == NULL) {
+		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+		return 0;
+	}
+
+	memset(fh, 0, sizeof(*fh));
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		if (len > NFS4_FHSIZE)
+			return -EIO;
+		fh->size = len;
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		memcpy(fh->data, p, len);
+		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
 	__be32 *p;
@@ -3744,29 +3796,14 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_fattr *fattr, struct nfs_fh *fh,
 		const struct nfs_server *server, int may_sleep)
 {
-	__be32 *savep;
-	uint32_t attrlen,
-		 bitmap[2] = {0},
-		 type;
 	int status;
 	umode_t fmode = 0;
 	uint64_t fileid;
-
-	status = decode_op_hdr(xdr, OP_GETATTR);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_bitmap(xdr, bitmap);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_length(xdr, &attrlen, &savep);
-	if (status < 0)
-		goto xdr_error;
-
+	uint32_t type;
 
 	status = decode_attr_type(xdr, bitmap, &type);
 	if (status < 0)
@@ -3792,6 +3829,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_error(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_filehandle(xdr, bitmap, fh);
+	if (status < 0)
+		goto xdr_error;
+
 	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
 	if (status < 0)
 		goto xdr_error;
@@ -3857,17 +3902,51 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
 	if (status < 0)
 		goto xdr_error;
-	if (status != 0 && !(fattr->valid & status)) {
+	if (status != 0) {
 		fattr->fileid = fileid;
 		fattr->valid |= status;
 	}
 
+xdr_error:
+	dprintk("%s: xdr returned %d\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+	__be32 *savep;
+	uint32_t attrlen,
+		 bitmap[2] = {0};
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+	if (status < 0)
+		goto xdr_error;
+
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
 }
 
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+}
 
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
-- 
cgit v1.2.3


From 82f2e5472e2304e531c2fa85e457f4a71070044e Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Thu, 21 Oct 2010 16:33:18 -0400
Subject: NFS: Readdir plus in v4

By requsting more attributes during a readdir, we can mimic the readdir plus
operation that was in NFSv3.

To test, I ran the command `ls -lU --color=none` on directories with various
numbers of files.  Without readdir plus, I see this:

n files |    100    |   1,000   |  10,000   |  100,000  | 1,000,000
--------+-----------+-----------+-----------+-----------+----------
real    | 0m00.153s | 0m00.589s | 0m05.601s | 0m56.691s | 9m59.128s
user    | 0m00.007s | 0m00.007s | 0m00.077s | 0m00.703s | 0m06.800s
sys     | 0m00.010s | 0m00.070s | 0m00.633s | 0m06.423s | 1m10.005s
access  | 3         | 1         | 1         | 4         | 31
getattr | 2         | 1         | 1         | 1         | 1
lookup  | 104       | 1,003     | 10,003    | 100,003   | 1,000,003
readdir | 2         | 16        | 158       | 1,575     | 15,749
total   | 111       | 1,021     | 10,163    | 101,583   | 1,015,784

With readdir plus enabled, I see this:

n files |    100    |   1,000   |  10,000   |  100,000  | 1,000,000
--------+-----------+-----------+-----------+-----------+----------
real    | 0m00.115s | 0m00.206s | 0m01.079s | 0m12.521s | 2m07.528s
user    | 0m00.003s | 0m00.003s | 0m00.040s | 0m00.290s | 0m03.296s
sys     | 0m00.007s | 0m00.020s | 0m00.120s | 0m01.357s | 0m17.556s
access  | 3         | 1         | 1         | 1         | 7
getattr | 2         | 1         | 1         | 1         | 1
lookup  | 4         | 3         | 3         | 3         | 3
readdir | 6         | 62        | 630       | 6,300     | 62,993
total   | 15        | 67        | 635       | 6,305     | 63,004

Readdir plus disabled has about a 16x increase in the number of rpc calls and
is 4 - 5 times slower on large directories.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c         |  5 +++--
 fs/nfs/dir.c            |  4 ++--
 fs/nfs/internal.h       |  6 +++---
 fs/nfs/nfs2xdr.c        |  2 +-
 fs/nfs/nfs3xdr.c        |  2 +-
 fs/nfs/nfs4_fs.h        |  2 +-
 fs/nfs/nfs4proc.c       |  1 +
 fs/nfs/nfs4xdr.c        | 55 ++++++++++++++++++++++++-------------------------
 include/linux/nfs_xdr.h |  3 ++-
 9 files changed, 41 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 876ba8592706..da2f2f024a4d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1358,8 +1358,9 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
-		NFS_CAP_POSIX_LOCK;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
+	if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
 	server->options = data->options;
 
 	/* Get a client record */
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0cbb714a09d8..2a768d05a534 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -172,7 +172,7 @@ struct nfs_cache_array {
 
 #define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
 
-typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
@@ -360,7 +360,7 @@ error:
 static
 int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-	__be32 *p = desc->decode(stream, entry, desc->plus);
+	__be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b0e894d00c8..db08ff3ff454 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -187,15 +187,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, int);
+extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 82f026422424..e6bf45710cc7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -454,7 +454,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 }
 
 __be32 *
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
+nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
 	__be32 *p;
 	p = xdr_inline_decode(xdr, 4);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index dc98eb7976c3..31a44df40aea 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -590,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 }
 
 __be32 *
-nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
+nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
 	__be32 *p;
 	struct nfs_entry old = *entry;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c58ea6377506..9fa496387fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -331,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cb33c73206e3..f5ab216e8870 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2832,6 +2832,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.pgbase = 0,
 		.count = count,
 		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.plus = plus,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b7eff205d3d8..ccfb1c92b262 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1385,12 +1385,20 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-	uint32_t attrs[2] = {
-		FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
-		FATTR4_WORD1_MOUNTED_ON_FILEID,
-	};
+	uint32_t attrs[2] = {0, 0};
 	__be32 *p;
 
+	if (readdir->plus) {
+		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+	}
+	attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+	attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+
 	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
@@ -1398,11 +1406,15 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
-	/* Switch to mounted_on_fileid if the server supports it */
-	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-		attrs[0] &= ~FATTR4_WORD0_FILEID;
-	else
-		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
+	if (!readdir->plus) {
+		/* Switch to mounted_on_fileid if the server supports it */
+		if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+			attrs[0] &= ~FATTR4_WORD0_FILEID;
+		else
+			attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+	}
+
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
 	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
@@ -5768,7 +5780,8 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+			   struct nfs_server *server, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
@@ -5824,24 +5837,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int
 		goto out_overflow;
 	len = XDR_QUADLEN(ntohl(*p++));	/* attribute buffer length */
 	if (len > 0) {
-		if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
-			bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
-			/* Ignore the return value of rdattr_error for now */
-			p = xdr_inline_decode(xdr, 4);
-			if (unlikely(!p))
-				goto out_overflow;
-		}
-		if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) {
-			p = xdr_inline_decode(xdr, 8);
-			if (unlikely(!p))
-				goto out_overflow;
-			xdr_decode_hyper(p, &entry->ino);
-		} else if (bitmap[0] == FATTR4_WORD0_FILEID) {
-			p = xdr_inline_decode(xdr, 8);
-			if (unlikely(!p))
-				goto out_overflow;
-			xdr_decode_hyper(p, &entry->ino);
-		}
+		if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+			goto out_overflow;
+		if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+			entry->ino = entry->fattr->fileid;
 	}
 
 	p = xdr_inline_peek(xdr, 8);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 1b9a17a1f235..efe2eab8ac94 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -778,6 +778,7 @@ struct nfs4_readdir_arg {
 	struct page **			pages;	/* zero-copy data */
 	unsigned int			pgbase;	/* zero-copy data */
 	const u32 *			bitmask;
+	int				plus;
 	struct nfs4_sequence_args	seq_args;
 };
 
@@ -1036,7 +1037,7 @@ struct nfs_rpc_ops {
 	int	(*pathconf) (struct nfs_server *, struct nfs_fh *,
 			     struct nfs_pathconf *);
 	int	(*set_capabilities)(struct nfs_server *, struct nfs_fh *);
-	__be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int plus);
+	__be32 *(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int plus);
 	void	(*read_setup)   (struct nfs_read_data *, struct rpc_message *);
 	int	(*read_done)  (struct rpc_task *, struct nfs_read_data *);
 	void	(*write_setup)  (struct nfs_write_data *, struct rpc_message *);
-- 
cgit v1.2.3


From 4a201d6e3f4253f918555cc7c27c418f8ac1bb65 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Oct 2010 14:53:23 -0400
Subject: NFS: Ensure we check all allocation return values in new readdir code

Also some clean ups.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 61 ++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2a768d05a534..257e4052492e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -225,33 +225,42 @@ int nfs_readdir_clear_array(struct page *page, gfp_t mask)
  * nfs_clear_readdir_array()
  */
 static
-void nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
 {
 	string->len = len;
 	string->name = kmemdup(name, len, GFP_KERNEL);
-	string->hash = full_name_hash(string->name, string->len);
+	if (string->name == NULL)
+		return -ENOMEM;
+	string->hash = full_name_hash(name, len);
+	return 0;
 }
 
 static
 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 {
 	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	struct nfs_cache_array_entry *cache_entry;
+	int ret;
+
 	if (IS_ERR(array))
 		return PTR_ERR(array);
-	if (array->size >= MAX_READDIR_ARRAY) {
-		nfs_readdir_release_array(page);
-		return -EIO;
-	}
+	ret = -EIO;
+	if (array->size >= MAX_READDIR_ARRAY)
+		goto out;
 
-	array->array[array->size].cookie = entry->prev_cookie;
+	cache_entry = &array->array[array->size];
+	cache_entry->cookie = entry->prev_cookie;
+	cache_entry->ino = entry->ino;
+	ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+	if (ret)
+		goto out;
 	array->last_cookie = entry->cookie;
-	array->array[array->size].ino = entry->ino;
-	nfs_readdir_make_qstr(&array->array[array->size].string, entry->name, entry->len);
 	if (entry->eof == 1)
 		array->eof_index = array->size;
 	array->size++;
+out:
 	nfs_readdir_release_array(page);
-	return 0;
+	return ret;
 }
 
 static
@@ -388,21 +397,24 @@ different:
 static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
-	struct qstr filename;
-	struct dentry *dentry = NULL;
-	struct dentry *alias = NULL;
+	struct qstr filename = {
+		.len = entry->len,
+		.name = entry->name,
+	};
+	struct dentry *dentry;
+	struct dentry *alias;
 	struct inode *dir = parent->d_inode;
 	struct inode *inode;
 
-	nfs_readdir_make_qstr(&filename, entry->name, entry->len);
-	if (filename.len == 1 && filename.name[0] == '.')
-		dentry = dget(parent);
-	else if (filename.len == 2 && filename.name[0] == '.'
-				   && filename.name[1] == '.')
-		dentry = dget_parent(parent);
-	else
-		dentry = d_lookup(parent, &filename);
+	if (filename.name[0] == '.') {
+		if (filename.len == 1)
+			return;
+		if (filename.len == 2 && filename.name[1] == '.')
+			return;
+	}
+	filename.hash = full_name_hash(filename.name, filename.len);
 
+	dentry = d_lookup(parent, &filename);
 	if (dentry != NULL) {
 		if (nfs_same_file(dentry, entry)) {
 			nfs_refresh_inode(dentry->d_inode, entry->fattr);
@@ -414,6 +426,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	}
 
 	dentry = d_alloc(parent, &filename);
+	if (dentry == NULL)
+		return;
+
 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
 	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
 	if (IS_ERR(inode))
@@ -430,8 +445,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 
 out:
 	dput(dentry);
-	kfree(filename.name);
-	return;
 }
 
 /* Perform conversion from xdr to cache array */
@@ -508,7 +521,7 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 		pages[i] = page;
 	}
 
-	ptr = vm_map_ram(pages, NFS_MAX_READDIR_PAGES, 0, PAGE_KERNEL);
+	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
 	if (!IS_ERR_OR_NULL(ptr))
 		return ptr;
 out_freepages:
-- 
cgit v1.2.3


From 7ad07353003d6ff69fe0b987813bb77b4d5ac23d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Oct 2010 15:34:20 -0400
Subject: NFSv4: Fix up decode_attr_filehandle() to handle the case of empty fh
 pointer

decode_attr_filehandle still needs to skip the XDR-encoded filehandle if
someone passes a null pointer argument.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ccfb1c92b262..a6b00e84bd1a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2883,12 +2883,8 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
 	__be32 *p;
 	int len;
 
-	if (fh == NULL) {
-		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
-		return 0;
-	}
-
-	memset(fh, 0, sizeof(*fh));
+	if (fh != NULL)
+		memset(fh, 0, sizeof(*fh));
 
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
 		return -EIO;
@@ -2899,11 +2895,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
 		len = be32_to_cpup(p);
 		if (len > NFS4_FHSIZE)
 			return -EIO;
-		fh->size = len;
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
-		memcpy(fh->data, p, len);
+		if (fh != NULL) {
+			memcpy(fh->data, p, len);
+			fh->size = len;
+		}
 		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
 	}
 	return 0;
-- 
cgit v1.2.3


From 3201f3dd7370f2d29dfb689ae16f8f5d4066cc33 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Oct 2010 15:43:10 -0400
Subject: NFSv4: Fix a regression in decode_getfattr

We don't want to have the mounted_on_fileid overwrite the true fileid. We
only return the former if the server didn't supply the true fileid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a6b00e84bd1a..707975eebb5d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3912,7 +3912,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
 	if (status < 0)
 		goto xdr_error;
-	if (status != 0) {
+	if (status != 0 && !(fattr->valid & status)) {
 		fattr->fileid = fileid;
 		fattr->valid |= status;
 	}
-- 
cgit v1.2.3


From 4f082222fad3c8471abe0c8e8f18c72f335a34c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 24 Oct 2010 13:14:02 -0400
Subject: NFSv4: nfs4_decode_dirent must clear entry->fattr->valid

Otherwise, we may end up reading uninitialised data from the resulting
struct nfs_fattr.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 707975eebb5d..9bf5e66d11db 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5815,6 +5815,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	 * since glibc seems to choke on it...)
 	 */
 	entry->ino = 1;
+	entry->fattr->valid = 0;
 
 	len = ntohl(*p++);		/* bitmap length */
 	if (len-- > 0) {
-- 
cgit v1.2.3


From 9af8c222ca5eae88f000664f693316480bf58fbc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 24 Oct 2010 11:52:55 -0400
Subject: NFSv4: Clean up nfs4_decode_dirent

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9bf5e66d11db..4c43e4874c52 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5803,11 +5803,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	p = xdr_decode_hyper(p, &entry->cookie);
 	entry->len = ntohl(*p++);
 
-	p = xdr_inline_decode(xdr, entry->len + 4);
+	p = xdr_inline_decode(xdr, entry->len);
 	if (unlikely(!p))
 		goto out_overflow;
 	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
 
 	/*
 	 * In case the server doesn't return an inode number,
@@ -5817,30 +5816,19 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	entry->ino = 1;
 	entry->fattr->valid = 0;
 
-	len = ntohl(*p++);		/* bitmap length */
-	if (len-- > 0) {
-		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
-			goto out_overflow;
-		bitmap[0] = ntohl(*p++);
-		if (len-- > 0) {
-			p = xdr_inline_decode(xdr, 4);
-			if (unlikely(!p))
-				goto out_overflow;
-			bitmap[1] = ntohl(*p++);
-			p += len;
-		}
-	}
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (decode_attr_bitmap(xdr, bitmap) < 0)
+		goto out_overflow;
+
+	if (decode_attr_length(xdr, &len, &p) < 0)
+		goto out_overflow;
+
+	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+		goto out_overflow;
+	if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+		entry->ino = entry->fattr->fileid;
+
+	if (verify_attr_len(xdr, p, len) < 0)
 		goto out_overflow;
-	len = XDR_QUADLEN(ntohl(*p++));	/* attribute buffer length */
-	if (len > 0) {
-		if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
-			goto out_overflow;
-		if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
-			entry->ino = entry->fattr->fileid;
-	}
 
 	p = xdr_inline_peek(xdr, 8);
 	if (p != NULL)
-- 
cgit v1.2.3


From 6f7a35bd23bdbbb40c07ee1120ef047190e77d9b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 24 Oct 2010 12:11:42 -0400
Subject: NFSv4: Fix up the 'dircount' hint in encode_readdir

Also ensure we only ask for either fileid or mounted_on_fileid in the
readdirplus case too...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4c43e4874c52..1e5d68e36e07 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1386,6 +1386,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
 	uint32_t attrs[2] = {0, 0};
+	uint32_t dircount = readdir->count >> 1;
 	__be32 *p;
 
 	if (readdir->plus) {
@@ -1395,26 +1396,24 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
 			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
 			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		dircount >>= 1;
 	}
 	attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
 	attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+	/* Switch to mounted_on_fileid if the server supports it */
+	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+		attrs[0] &= ~FATTR4_WORD0_FILEID;
+	else
+		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 
 	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
 
-	if (!readdir->plus) {
-		/* Switch to mounted_on_fileid if the server supports it */
-		if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-			attrs[0] &= ~FATTR4_WORD0_FILEID;
-		else
-			attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
-	}
-
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
 	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
-- 
cgit v1.2.3


From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Sun, 24 Oct 2010 22:06:02 +0200
Subject: Revert "block: fix accounting bug on cross partition merges"

This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327.

Conflicts:

	include/linux/genhd.h

It has numerous issues with the cleanup path and non-elevator
devices. Revert it for now so we can come up with a clean
version without rushing things.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         | 24 ++++++++----------------
 block/blk-merge.c        |  2 +-
 block/blk.h              |  4 ++++
 block/genhd.c            | 14 --------------
 fs/partitions/check.c    | 12 ------------
 include/linux/blkdev.h   |  1 -
 include/linux/elevator.h |  2 --
 include/linux/genhd.h    |  1 -
 8 files changed, 13 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/block/blk-core.c b/block/blk-core.c
index 51efd835d4cf..f8548876d7ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,15 +64,13 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
+	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-	if (!new_io) {
-		part = rq->part;
+	if (!new_io)
 		part_stat_inc(cpu, part, merges[rw]);
-	} else {
-		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+	else {
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
-		rq->part = part;
 	}
 
 	part_stat_unlock();
@@ -130,7 +128,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
-	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -805,16 +802,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->starved[is_sync] = 0;
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv) {
+	if (priv)
 		rl->elvpriv++;
 
-		/*
-		 * Don't do stats for non-priv requests
-		 */
-		if (blk_queue_io_stat(q))
-			rw_flags |= REQ_IO_STAT;
-	}
-
+	if (blk_queue_io_stat(q))
+		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1791,7 +1783,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1811,7 +1803,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0a2fd8a48a38..77b7c26df6b5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = req->part;
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
diff --git a/block/blk.h b/block/blk.h
index 1e675e5ade02..2db8f32838e7 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -116,6 +116,10 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
+void elv_quiesce_start(struct request_queue *q);
+void elv_quiesce_end(struct request_queue *q);
+
+
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
diff --git a/block/genhd.c b/block/genhd.c
index a8adf96a4b41..5fa2b44a72ff 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -929,15 +929,8 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
 {
 	struct disk_part_tbl *ptbl =
 		container_of(head, struct disk_part_tbl, rcu_head);
-	struct gendisk *disk = ptbl->disk;
-	struct request_queue *q = disk->queue;
-	unsigned long flags;
 
 	kfree(ptbl);
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	elv_quiesce_end(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /**
@@ -955,17 +948,11 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 				  struct disk_part_tbl *new_ptbl)
 {
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
-	struct request_queue *q = disk->queue;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
 
 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
-
-		spin_lock_irq(q->queue_lock);
-		elv_quiesce_start(q);
-		spin_unlock_irq(q->queue_lock);
-
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
 	}
 }
@@ -1006,7 +993,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 		return -ENOMEM;
 
 	new_ptbl->len = target;
-	new_ptbl->disk = disk;
 
 	for (i = 0; i < len; i++)
 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b81bfc016a05..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,25 +365,17 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
-	struct gendisk *disk = part_to_disk(part);
-	struct request_queue *q = disk->queue;
-	unsigned long flags;
 
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
 	put_device(part_to_dev(part));
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	elv_quiesce_end(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
-	struct request_queue *q = disk->queue;
 
 	if (partno >= ptbl->len)
 		return;
@@ -398,10 +390,6 @@ void delete_partition(struct gendisk *disk, int partno)
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-	spin_lock_irq(q->queue_lock);
-	elv_quiesce_start(q);
-	spin_unlock_irq(q->queue_lock);
-
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 009b80e49f53..646b462d04df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,7 +115,6 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
-	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 80a0ece8f7e4..4fd978e7eb83 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -122,8 +122,6 @@ extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
-extern void elv_quiesce_start(struct request_queue *);
-extern void elv_quiesce_end(struct request_queue *);
 
 /*
  * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 557c3927e70f..7a7b9c1644e4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -140,7 +140,6 @@ struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
 	struct hd_struct __rcu *last_lookup;
-	struct gendisk *disk;
 	struct hd_struct __rcu *part[];
 };
 
-- 
cgit v1.2.3


From 6b96724e507fecc3e6440e86426fe4f44359ed66 Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Tue, 12 Oct 2010 16:30:05 -0700
Subject: Revalidate caches on lock

Instead of blindly zapping the caches, attempt to revalidate them if
the server has indicated that it uses high resolution timestamps.

NFSv4 should be able to always revalidate the cache since the
protocol requires the update of the change attribute on modification of
the data.  In reality, there are servers (the Linux NFS server
for example) that do not obey this requirement and use ctime as the
basis for change attribute.  Long term, the server needs to be fixed.
At this time, and to be on the safe side, continue zapping caches if
the server indicates that it does not have a high resolution timestamp.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  2 ++
 fs/nfs/file.c             | 19 ++++++++++++++++---
 fs/nfs/nfs3xdr.c          |  3 ++-
 include/linux/nfs_fs_sb.h |  1 +
 include/linux/nfs_xdr.h   |  1 +
 5 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index da2f2f024a4d..a63bce8d0596 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -915,6 +915,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 
 	server->maxfilesize = fsinfo->maxfilesize;
 
+	server->time_delta = fsinfo->time_delta;
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 39672b731736..c3f2477c16c1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -757,6 +757,11 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	return status;
 }
 
+static int
+is_time_granular(struct timespec *ts) {
+	return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
+}
+
 static int
 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -781,13 +786,21 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 		status = do_vfs_lock(filp, fl);
 	if (status < 0)
 		goto out;
+
 	/*
-	 * Make sure we clear the cache whenever we try to get the lock.
+	 * Revalidate the cache if the server has time stamps granular
+	 * enough to detect subsecond changes.  Otherwise, clear the
+	 * cache to prevent missing any changes.
+	 *
 	 * This makes locking act as a cache coherency point.
 	 */
 	nfs_sync_mapping(filp->f_mapping);
-	if (!nfs_have_delegation(inode, FMODE_READ))
-		nfs_zap_caches(inode);
+	if (!nfs_have_delegation(inode, FMODE_READ)) {
+		if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+			__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		else
+			nfs_zap_caches(inode);
+	}
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 31a44df40aea..d9a5e832c257 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1044,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
 	res->wtmult = ntohl(*p++);
 	res->dtpref = ntohl(*p++);
 	p = xdr_decode_hyper(p, &res->maxfilesize);
+	p = xdr_decode_time3(p, &res->time_delta);
 
-	/* ignore time_delta and properties */
+	/* ignore properties */
 	res->lease_time = 0;
 	return 0;
 }
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index c82ee7cd6288..5eef862ec187 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -124,6 +124,7 @@ struct nfs_server {
 
 	struct nfs_fsid		fsid;
 	__u64			maxfilesize;	/* maximum file size */
+	struct timespec		time_delta;	/* smallest time granularity */
 	unsigned long		mount_time;	/* when this fs was mounted */
 	dev_t			s_dev;		/* superblock dev numbers */
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index efe2eab8ac94..da7a1300dc60 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -112,6 +112,7 @@ struct nfs_fsinfo {
 	__u32			wtmult;	/* writes should be multiple of this */
 	__u32			dtpref;	/* pref. readdir transfer size */
 	__u64			maxfilesize;
+	struct timespec		time_delta; /* server time granularity */
 	__u32			lease_time; /* in seconds */
 };
 
-- 
cgit v1.2.3


From 55b6e7742d5b25182edf410369379b9727b2e5bc Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Tue, 12 Oct 2010 16:30:06 -0700
Subject: Ask for time_delta during fsinfo probe

Used by the client to determine if the server has a granular enough
time stamp.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c |  2 +-
 fs/nfs/nfs4xdr.c  | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f5ab216e8870..e87fe612ca18 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -129,7 +129,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
 			| FATTR4_WORD0_MAXREAD
 			| FATTR4_WORD0_MAXWRITE
 			| FATTR4_WORD0_LEASE_TIME,
-			0
+			FATTR4_WORD1_TIME_DELTA
 };
 
 const u32 nfs4_fs_locations_bitmap[2] = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1e5d68e36e07..7131c761d85c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3582,6 +3582,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
 	return status;
 }
 
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+				  struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+		status = decode_attr_time(xdr, time);
+		bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+	}
+	dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+		(long)time->tv_nsec);
+	return status;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
 	int status = 0;
@@ -3982,6 +4000,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
 		goto xdr_error;
 	fsinfo->wtpref = fsinfo->wtmax;
+	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+	if (status != 0)
+		goto xdr_error;
 
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
-- 
cgit v1.2.3


From 3388bff5cfe91589a912cdc7f00d3aae3aa18adc Mon Sep 17 00:00:00 2001
From: Roman Borisov <ext-roman.borisov@nokia.com>
Date: Wed, 13 Oct 2010 16:54:51 +0400
Subject: nfs: fix unchecked value

Return value of "decode_attr_bitmap()" was not checked;

Signed-off-by: Roman Borisov <ext-roman.borisov@nokia.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7131c761d85c..bd2101d918c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2687,7 +2687,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
 	if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-		decode_attr_bitmap(xdr, bitmask);
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
 	} else
 		bitmask[0] = bitmask[1] = 0;
-- 
cgit v1.2.3


From 3c9101a0575f6078191dae6bb91a834c6aac3557 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 20 Oct 2010 00:17:53 -0400
Subject: NFSD: remove duplicate NFS4_STATEID_SIZE

Already accepted by Bruce

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4callback.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..014482c4e57d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
-- 
cgit v1.2.3


From 9449925273933d19235d7d36c1fd970841d055de Mon Sep 17 00:00:00 2001
From: Alexandros Batsakis <batsakis@netapp.com>
Date: Wed, 20 Oct 2010 00:17:56 -0400
Subject: NFS: change stateid to be a union

In NFSv4.1 the stateid consists of the other and seqid fields. For layout
processing we need to numerically compare the seqid value of layout stateids.
To do so, introduce a union to nfs4_stateid to switch between opaque(16 bytes)
and opaque(12 bytes) / __be32

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c |  8 ++++----
 include/linux/nfs4.h   | 15 +++++++++++++--
 2 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
 	if (delegation == NULL)
 		return 0;
 
-	/* seqid is 4-bytes long */
-	if (((u32 *) &stateid->data)[0] != 0)
+	if (stateid->stateid.seqid != 0)
 		return 0;
-	if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
-		   sizeof(stateid->data)-4))
+	if (memcmp(&delegation->stateid.stateid.other,
+		   &stateid->stateid.other,
+		   NFS4_STATEID_OTHER_SIZE))
 		return 0;
 
 	return 1;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 6c0406e87d5c..34da32436ac0 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -17,7 +17,9 @@
 
 #define NFS4_BITMAP_SIZE	2
 #define NFS4_VERIFIER_SIZE	8
-#define NFS4_STATEID_SIZE	16
+#define NFS4_STATEID_SEQID_SIZE 4
+#define NFS4_STATEID_OTHER_SIZE 12
+#define NFS4_STATEID_SIZE	(NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
 #define NFS4_FHSIZE		128
 #define NFS4_MAXPATHLEN		PATH_MAX
 #define NFS4_MAXNAMLEN		NAME_MAX
@@ -167,7 +169,16 @@ struct nfs4_acl {
 };
 
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
+
+struct nfs41_stateid {
+	__be32 seqid;
+	char other[NFS4_STATEID_OTHER_SIZE];
+} __attribute__ ((packed));
+
+typedef union {
+	char data[NFS4_STATEID_SIZE];
+	struct nfs41_stateid stateid;
+} nfs4_stateid;
 
 enum nfs_opnum4 {
 	OP_ACCESS = 3,
-- 
cgit v1.2.3


From 504913fbc84c00bba7224d73e4aab525c1731f7d Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 20 Oct 2010 00:17:57 -0400
Subject: NFS: ask for layouttypes during v4 fsinfo call

This information will be used to determine which layout driver,
if any, to use for subsequent IO on this filesystem.  Each driver
is assigned an integer id, with 0 reserved to indicate no driver.

The server can in theory return multiple ids.  However, our current
client implementation only notes the first entry and ignores the
rest.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       |  1 +
 fs/nfs/nfs4xdr.c        | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 60 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e87fe612ca18..a5f1edb45b47 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -130,6 +130,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
 			| FATTR4_WORD0_MAXWRITE
 			| FATTR4_WORD0_LEASE_TIME,
 			FATTR4_WORD1_TIME_DELTA
+			| FATTR4_WORD1_FS_LAYOUT_TYPES
 };
 
 const u32 nfs4_fs_locations_bitmap[2] = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index bd2101d918c8..8b4dfa393f0f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3978,6 +3978,61 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
 }
 
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+					 uint32_t *layouttype)
+{
+	uint32_t *p;
+	int num;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	num = be32_to_cpup(p);
+
+	/* pNFS is not supported by the underlying file system */
+	if (num == 0) {
+		*layouttype = 0;
+		return 0;
+	}
+	if (num > 1)
+		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+			"per filesystem not supported\n", __func__);
+
+	/* Decode and set first layout type, move xdr->p past unused types */
+	p = xdr_inline_decode(xdr, num * 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*layouttype = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+				uint32_t *layouttype)
+{
+	int status = 0;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+		return -EIO;
+	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+	} else
+		*layouttype = 0;
+	return status;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
 	__be32 *savep;
@@ -4004,6 +4059,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 		goto xdr_error;
 	fsinfo->wtpref = fsinfo->wtmax;
 	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+	if (status != 0)
+		goto xdr_error;
+	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
 	if (status != 0)
 		goto xdr_error;
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index da7a1300dc60..065f9d105d05 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -114,6 +114,7 @@ struct nfs_fsinfo {
 	__u64			maxfilesize;
 	struct timespec		time_delta; /* server time granularity */
 	__u32			lease_time; /* in seconds */
+	__u32			layouttype; /* supported pnfs layout driver */
 };
 
 struct nfs_fsstat {
-- 
cgit v1.2.3


From 85e174ba6b786ad336eb2df105b4f66d0932e70a Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 20 Oct 2010 00:17:58 -0400
Subject: NFS: set layout driver

Put in the infrastructure that uses information returned from the
server at mount to select a layout driver module.

In this patch, a stub is used that always returns "no driver found".

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile           |  1 +
 fs/nfs/client.c           |  5 +++
 fs/nfs/pnfs.c             | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/pnfs.h             | 56 +++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h    |  1 +
 include/linux/nfs_fs_sb.h |  1 +
 6 files changed, 148 insertions(+)
 create mode 100644 fs/nfs/pnfs.c
 create mode 100644 fs/nfs/pnfs.h

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..bb9e773d4312 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,6 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o \
 			   nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a63bce8d0596..eba0bcc1bab0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
@@ -900,6 +901,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	set_pnfs_layoutdriver(server, fsinfo->layouttype);
+
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
@@ -939,6 +942,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}
 
 	fsinfo.fattr = fattr;
+	fsinfo.layouttype = 0;
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -1021,6 +1025,7 @@ void nfs_free_server(struct nfs_server *server)
 {
 	dprintk("--> nfs_free_server()\n");
 
+	unset_pnfs_layoutdriver(server);
 	spin_lock(&nfs_client_lock);
 	list_del(&server->client_link);
 	list_del(&server->master_link);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..b483026e82aa
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,84 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+/* STUB that returns the equivalent of "no module found" */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+	return NULL;
+}
+
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+	nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+{
+	struct pnfs_layoutdriver_type *ld_type = NULL;
+
+	if (id == 0)
+		goto out_no_driver;
+	if (!(server->nfs_client->cl_exchange_flags &
+		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+		printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
+		       id, server->nfs_client->cl_exchange_flags);
+		goto out_no_driver;
+	}
+	ld_type = find_pnfs_driver(id);
+	if (!ld_type) {
+		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+		ld_type = find_pnfs_driver(id);
+		if (!ld_type) {
+			dprintk("%s: No pNFS module found for %u.\n",
+				__func__, id);
+			goto out_no_driver;
+		}
+	}
+	server->pnfs_curr_ld = ld_type;
+	dprintk("%s: pNFS module for %u set\n", __func__, id);
+	return;
+
+out_no_driver:
+	dprintk("%s: Using NFSv4 I/O\n", __func__);
+	server->pnfs_curr_ld = NULL;
+}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..c628ef131d83
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,56 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+};
+
+void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+
+#else  /* CONFIG_NFS_V4_1 */
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+{
+}
+
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* FS_NFS_PNFS_H */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d929b1883644..aba3da2a6227 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -615,6 +615,7 @@ nfs_fileid_to_ino_t(u64 fileid)
 #define NFSDBG_CLIENT		0x0200
 #define NFSDBG_MOUNT		0x0400
 #define NFSDBG_FSCACHE		0x0800
+#define NFSDBG_PNFS		0x1000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 5eef862ec187..c38619d95a57 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -145,6 +145,7 @@ struct nfs_server {
 	u32			acl_bitmask;	/* V4 bitmask representing the ACEs
 						   that are supported on this
 						   filesystem */
+	struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
 #endif
 	void (*destroy)(struct nfs_server *);
 
-- 
cgit v1.2.3


From 02c35fca7cf4ea2dfdc6db279e230cacbbf4b870 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Wed, 20 Oct 2010 00:17:59 -0400
Subject: NFSv4.1: pnfs: full mount/umount infrastructure

Allow a module implementing a layout type to register, and
have its mount/umount routines called for filesystems that
the server declares support it.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Andy Adamson<andros@netapp.com>
Signed-off-by: Bian Naimeng <biannm@cn.fujitsu.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/nfs/00-INDEX |  2 +
 Documentation/filesystems/nfs/pnfs.txt | 48 +++++++++++++++++++
 fs/nfs/Kconfig                         |  8 +++-
 fs/nfs/pnfs.c                          | 88 +++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.h                          |  9 ++++
 5 files changed, 151 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/filesystems/nfs/pnfs.txt

(limited to 'fs')

diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 3225a5662114..a57e12411d2a 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,6 +12,8 @@ nfs-rdma.txt
 	- how to install and setup the Linux NFS/RDMA client and server software
 nfsroot.txt
 	- short guide on setting up a diskless box with NFS root filesystem.
+pnfs.txt
+	- short explanation of some of the internals of the pnfs client code
 rpc-cache.txt
 	- introduction to the caching mechanisms in the sunrpc layer.
 idmapper.txt
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
new file mode 100644
index 000000000000..bc0b9cfe095b
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -0,0 +1,48 @@
+Reference counting in pnfs:
+==========================
+
+The are several inter-related caches.  We have layouts which can
+reference multiple devices, each of which can reference multiple data servers.
+Each data server can be referenced by multiple devices.  Each device
+can be referenced by multiple layouts.  To keep all of this straight,
+we need to reference count.
+
+
+struct pnfs_layout_hdr
+----------------------
+The on-the-wire command LAYOUTGET corresponds to struct
+pnfs_layout_segment, usually referred to by the variable name lseg.
+Each nfs_inode may hold a pointer to a cache of of these layout
+segments in nfsi->layout, of type struct pnfs_layout_hdr.
+
+We reference the header for the inode pointing to it, across each
+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
+LAYOUTCOMMIT), and for each lseg held within.
+
+Each header is also (when non-empty) put on a list associated with
+struct nfs_client (cl_layouts).  Being put on this list does not bump
+the reference count, as the layout is kept around by the lseg that
+keeps it in the list.
+
+deviceid_cache
+--------------
+lsegs reference device ids, which are resolved per nfs_client and
+layout driver type.  The device ids are held in a RCU cache (struct
+nfs4_deviceid_cache).  The cache itself is referenced across each
+mount.  The entries (struct nfs4_deviceid) themselves are held across
+the lifetime of each lseg referencing them.
+
+RCU is used because the deviceid is basically a write once, read many
+data structure.  The hlist size of 32 buckets needs better
+justification, but seems reasonable given that we can have multiple
+deviceid's per filesystem, and multiple filesystems per nfs_client.
+
+The hash code is copied from the nfsd code base.  A discussion of
+hashing and variations of this algorithm can be found at:
+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
+
+data server cache
+-----------------
+file driver devices refer to data servers, which are kept in a module
+level cache.  Its reference is held over the lifetime of the deviceid
+pointing to it.
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3f69752d6f18..d94311943ec6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -75,13 +75,17 @@ config NFS_V4
 
 config NFS_V4_1
 	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-	depends on NFS_V4 && EXPERIMENTAL
+	depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+	select PNFS_FILE_LAYOUT
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
-	  (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+	  (RFC 5661) in the kernel's NFS client.
 
 	  If unsure, say N.
 
+config PNFS_FILE_LAYOUT
+	tristate
+
 config ROOT_NFS
 	bool "Root file system on NFS"
 	depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b483026e82aa..cf795625610e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -32,16 +32,51 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 
-/* STUB that returns the equivalent of "no module found" */
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+/* Return the registered pnfs layout driver module matching given id */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+	struct pnfs_layoutdriver_type *local;
+
+	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+		if (local->id == id)
+			goto out;
+	local = NULL;
+out:
+	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+	return local;
+}
+
 static struct pnfs_layoutdriver_type *
 find_pnfs_driver(u32 id)
 {
-	return NULL;
+	struct pnfs_layoutdriver_type *local;
+
+	spin_lock(&pnfs_spinlock);
+	local = find_pnfs_driver_locked(id);
+	spin_unlock(&pnfs_spinlock);
+	return local;
 }
 
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
+	if (nfss->pnfs_curr_ld) {
+		nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss);
+		module_put(nfss->pnfs_curr_ld->owner);
+	}
 	nfss->pnfs_curr_ld = NULL;
 }
 
@@ -74,7 +109,18 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
 			goto out_no_driver;
 		}
 	}
+	if (!try_module_get(ld_type->owner)) {
+		dprintk("%s: Could not grab reference on module\n", __func__);
+		goto out_no_driver;
+	}
 	server->pnfs_curr_ld = ld_type;
+	if (ld_type->initialize_mountpoint(server)) {
+		printk(KERN_ERR
+		       "%s: Error initializing mount point for layout driver %u.\n",
+		       __func__, id);
+		module_put(ld_type->owner);
+		goto out_no_driver;
+	}
 	dprintk("%s: pNFS module for %u set\n", __func__, id);
 	return;
 
@@ -82,3 +128,41 @@ out_no_driver:
 	dprintk("%s: Using NFSv4 I/O\n", __func__);
 	server->pnfs_curr_ld = NULL;
 }
+
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	int status = -EINVAL;
+	struct pnfs_layoutdriver_type *tmp;
+
+	if (ld_type->id == 0) {
+		printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+		return status;
+	}
+
+	spin_lock(&pnfs_spinlock);
+	tmp = find_pnfs_driver_locked(ld_type->id);
+	if (!tmp) {
+		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+		status = 0;
+		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+			ld_type->name);
+	} else {
+		printk(KERN_ERR "%s Module with id %d already loaded!\n",
+			__func__, ld_type->id);
+	}
+	spin_unlock(&pnfs_spinlock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+	spin_lock(&pnfs_spinlock);
+	list_del(&ld_type->pnfs_tblid);
+	spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c628ef131d83..61531f338576 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,8 +36,17 @@
 
 /* Per-layout driver specific registration structure */
 struct pnfs_layoutdriver_type {
+	struct list_head pnfs_tblid;
+	const u32 id;
+	const char *name;
+	struct module *owner;
+	int (*initialize_mountpoint) (struct nfs_server *);
+	int (*uninitialize_mountpoint) (struct nfs_server *);
 };
 
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 
-- 
cgit v1.2.3


From 7ab672ce312133ee4a5d85b71447b2b334403681 Mon Sep 17 00:00:00 2001
From: Dean Hildebrand <dhildebz@umich.edu>
Date: Wed, 20 Oct 2010 00:18:00 -0400
Subject: NFSv4.1: pnfs: filelayout: introduce minimal file layout driver

This driver just registers itself and supplies trivial mount/umount functions.

Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile         |  3 ++
 fs/nfs/nfs4filelayout.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h  |  1 +
 3 files changed, 82 insertions(+)
 create mode 100644 fs/nfs/nfs4filelayout.c

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index bb9e773d4312..08a8889c5149 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -18,3 +18,6 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..696e283c2632
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,78 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+int
+filelayout_initialize_mountpoint(struct nfs_server *nfss)
+{
+	return 0;
+}
+
+int
+filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
+{
+	dprintk("--> %s\n", __func__);
+
+	return 0;
+}
+
+static struct pnfs_layoutdriver_type filelayout_type = {
+	.id = LAYOUT_NFSV4_1_FILES,
+	.name = "LAYOUT_NFSV4_1_FILES",
+	.owner = THIS_MODULE,
+	.initialize_mountpoint   = filelayout_initialize_mountpoint,
+	.uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+	       __func__);
+	return pnfs_register_layoutdriver(&filelayout_type);
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+	       __func__);
+	pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index aba3da2a6227..499872fa895c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -616,6 +616,7 @@ nfs_fileid_to_ino_t(u64 fileid)
 #define NFSDBG_MOUNT		0x0400
 #define NFSDBG_FSCACHE		0x0800
 #define NFSDBG_PNFS		0x1000
+#define NFSDBG_PNFS_LD		0x2000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3


From e5e940170b2136ad4d5483ef293ae284b9cc8d53 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 20 Oct 2010 00:18:01 -0400
Subject: NFS: create and destroy inode's layout cache

At the start of the io paths, try to grab the relevant layout
information.  This will initiate the inode's layout cache, but
stubs ensure the cache stays empty.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          |   5 ++
 fs/nfs/inode.c         |   3 ++
 fs/nfs/pnfs.c          | 140 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/pnfs.h          |  39 ++++++++++++++
 fs/nfs/read.c          |   3 ++
 include/linux/nfs_fs.h |   3 ++
 6 files changed, 193 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c3f2477c16c1..91d019d39122 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+	pnfs_update_layout(mapping->host,
+			   nfs_file_open_context(file),
+			   IOMODE_RW);
+
 start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6eec28656415..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
 #include "internal.h"
 #include "fscache.h"
 #include "dns_resolve.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -1410,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
+	pnfs_destroy_layout(NFS_I(inode));
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
@@ -1447,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 	nfsi->delegation = NULL;
 	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
+	nfsi->layout = NULL;
 #endif
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cf795625610e..c0cd954855b9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -166,3 +166,143 @@ pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
 	spin_unlock(&pnfs_spinlock);
 }
 EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+static void
+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+	assert_spin_locked(&lo->inode->i_lock);
+	lo->refcount++;
+}
+
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+	assert_spin_locked(&lo->inode->i_lock);
+	BUG_ON(lo->refcount == 0);
+
+	lo->refcount--;
+	if (!lo->refcount) {
+		dprintk("%s: freeing layout cache %p\n", __func__, lo);
+		NFS_I(lo->inode)->layout = NULL;
+		kfree(lo);
+	}
+}
+
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+	struct pnfs_layout_hdr *lo;
+
+	spin_lock(&nfsi->vfs_inode.i_lock);
+	lo = nfsi->layout;
+	if (lo) {
+		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
+		put_layout_hdr_locked(lo);
+	}
+	spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+
+/* STUB - pretend LAYOUTGET to server failed */
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+	   struct nfs_open_context *ctx,
+	   u32 iomode)
+{
+	struct inode *ino = lo->inode;
+
+	set_bit(lo_fail_bit(iomode), &lo->state);
+	spin_lock(&ino->i_lock);
+	put_layout_hdr_locked(lo);
+	spin_unlock(&ino->i_lock);
+	return NULL;
+}
+
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo;
+
+	lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+	if (!lo)
+		return NULL;
+	lo->refcount = 1;
+	lo->inode = ino;
+	return lo;
+}
+
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *new = NULL;
+
+	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+	assert_spin_locked(&ino->i_lock);
+	if (nfsi->layout)
+		return nfsi->layout;
+
+	spin_unlock(&ino->i_lock);
+	new = alloc_init_layout_hdr(ino);
+	spin_lock(&ino->i_lock);
+
+	if (likely(nfsi->layout == NULL))	/* Won the race? */
+		nfsi->layout = new;
+	else
+		kfree(new);
+	return nfsi->layout;
+}
+
+/* STUB - LAYOUTGET never succeeds, so cache is empty */
+static struct pnfs_layout_segment *
+pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	return NULL;
+}
+
+/*
+ * Layout segment is retreived from the server if not cached.
+ * The appropriate layout segment is referenced and returned to the caller.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+		   struct nfs_open_context *ctx,
+		   enum pnfs_iomode iomode)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg = NULL;
+
+	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+		return NULL;
+	spin_lock(&ino->i_lock);
+	lo = pnfs_find_alloc_layout(ino);
+	if (lo == NULL) {
+		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
+		goto out_unlock;
+	}
+
+	/* Check to see if the layout for the given range already exists */
+	lseg = pnfs_has_layout(lo, iomode);
+	if (lseg) {
+		dprintk("%s: Using cached lseg %p for iomode %d)\n",
+			__func__, lseg, iomode);
+		goto out_unlock;
+	}
+
+	/* if LAYOUTGET already failed once we don't try again */
+	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+		goto out_unlock;
+
+	get_layout_hdr_locked(lo);
+	spin_unlock(&ino->i_lock);
+
+	lseg = send_layoutget(lo, ctx, iomode);
+out:
+	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+		nfsi->layout->state, lseg);
+	return lseg;
+out_unlock:
+	spin_unlock(&ino->i_lock);
+	goto out;
+}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 61531f338576..4ed1b48c71b1 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -34,6 +34,11 @@
 
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
 
+enum {
+	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
+	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
+};
+
 /* Per-layout driver specific registration structure */
 struct pnfs_layoutdriver_type {
 	struct list_head pnfs_tblid;
@@ -44,14 +49,48 @@ struct pnfs_layoutdriver_type {
 	int (*uninitialize_mountpoint) (struct nfs_server *);
 };
 
+struct pnfs_layout_hdr {
+	unsigned long		refcount;
+	unsigned long		state;
+	struct inode		*inode;
+};
+
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_destroy_layout(struct nfs_inode *);
+
+
+static inline int lo_fail_bit(u32 iomode)
+{
+	return iomode == IOMODE_RW ?
+			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+
+/* Return true if a layout driver is being used for this mountpoint */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+	return nfss->pnfs_curr_ld != NULL;
+}
 
 #else  /* CONFIG_NFS_V4_1 */
 
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+		   enum pnfs_iomode access_type)
+{
+	return NULL;
+}
+
 static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
 {
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 79859c81a943..e4b62c6f5a6e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -120,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
+	pnfs_update_layout(inode, ctx, IOMODE_READ);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
@@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
+	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 499872fa895c..0833bb67c831 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -188,6 +188,9 @@ struct nfs_inode {
 	struct nfs_delegation	*delegation;
 	fmode_t			 delegation_state;
 	struct rw_semaphore	rwsem;
+
+	/* pNFS layout information */
+	struct pnfs_layout_hdr *layout;
 #endif /* CONFIG_NFS_V4*/
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;
-- 
cgit v1.2.3


From 974cec8ca0352eb5d281535b714cf194a606e98f Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 20 Oct 2010 00:18:02 -0400
Subject: NFS: client needs to maintain list of inodes with active layouts

In particular, server reboot will invalidate all layouts.

Note that in order to have an active layout, we must get a successful response
from the server.  To avoid adding that machinery, this patch just includes a
stub that fakes up a successful return.  Since the layout is never referenced
for io, this is not a problem.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |   4 +-
 fs/nfs/nfs4state.c        |   2 +
 fs/nfs/pnfs.c             | 160 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.h             |  14 ++++
 include/linux/nfs_fs_sb.h |   1 +
 5 files changed, 177 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index eba0bcc1bab0..e55ad211dbed 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -156,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
-
+#if defined(CONFIG_NFS_V4_1)
+	INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
 	nfs_fscache_get_client_cookie(clp);
 
 	return clp;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e0dc06d74b6a..9c81f4d66950 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -54,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define OPENOWNER_POOL_SIZE	8
 
@@ -1475,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			}
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+			pnfs_destroy_all_layouts(clp);
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c0cd954855b9..891a0c36f992 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -28,6 +28,7 @@
  */
 
 #include <linux/nfs_fs.h>
+#include "internal.h"
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
@@ -183,38 +184,189 @@ put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 	lo->refcount--;
 	if (!lo->refcount) {
 		dprintk("%s: freeing layout cache %p\n", __func__, lo);
+		BUG_ON(!list_empty(&lo->layouts));
 		NFS_I(lo->inode)->layout = NULL;
 		kfree(lo);
 	}
 }
 
+static void
+put_layout_hdr(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	put_layout_hdr_locked(NFS_I(inode)->layout);
+	spin_unlock(&inode->i_lock);
+}
+
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+	INIT_LIST_HEAD(&lseg->fi_list);
+	kref_init(&lseg->kref);
+	lseg->layout = lo;
+}
+
+/* Called without i_lock held, as the free_lseg call may sleep */
+static void
+destroy_lseg(struct kref *kref)
+{
+	struct pnfs_layout_segment *lseg =
+		container_of(kref, struct pnfs_layout_segment, kref);
+	struct inode *ino = lseg->layout->inode;
+
+	dprintk("--> %s\n", __func__);
+	kfree(lseg);
+	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+	put_layout_hdr(ino);
+}
+
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (!lseg)
+		return;
+
+	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+		atomic_read(&lseg->kref.refcount));
+	kref_put(&lseg->kref, destroy_lseg);
+}
+
+static void
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+{
+	struct pnfs_layout_segment *lseg, *next;
+	struct nfs_client *clp;
+
+	dprintk("%s:Begin lo %p\n", __func__, lo);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+		dprintk("%s: freeing lseg %p\n", __func__, lseg);
+		list_move(&lseg->fi_list, tmp_list);
+	}
+	clp = NFS_SERVER(lo->inode)->nfs_client;
+	spin_lock(&clp->cl_lock);
+	/* List does not take a reference, so no need for put here */
+	list_del_init(&lo->layouts);
+	spin_unlock(&clp->cl_lock);
+
+	dprintk("%s:Return\n", __func__);
+}
+
+static void
+pnfs_free_lseg_list(struct list_head *tmp_list)
+{
+	struct pnfs_layout_segment *lseg;
+
+	while (!list_empty(tmp_list)) {
+		lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+				fi_list);
+		dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+		list_del(&lseg->fi_list);
+		put_lseg(lseg);
+	}
+}
+
 void
 pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 	struct pnfs_layout_hdr *lo;
+	LIST_HEAD(tmp_list);
 
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	lo = nfsi->layout;
 	if (lo) {
+		pnfs_clear_lseg_list(lo, &tmp_list);
 		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
 		put_layout_hdr_locked(lo);
 	}
 	spin_unlock(&nfsi->vfs_inode.i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+}
+
+/*
+ * Called by the state manger to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+	struct pnfs_layout_hdr *lo;
+	LIST_HEAD(tmp_list);
+
+	spin_lock(&clp->cl_lock);
+	list_splice_init(&clp->cl_layouts, &tmp_list);
+	spin_unlock(&clp->cl_lock);
+
+	while (!list_empty(&tmp_list)) {
+		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+				layouts);
+		dprintk("%s freeing layout for inode %lu\n", __func__,
+			lo->inode->i_ino);
+		pnfs_destroy_layout(NFS_I(lo->inode));
+	}
 }
 
-/* STUB - pretend LAYOUTGET to server failed */
+static void pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+			       struct pnfs_layout_segment *lseg);
+
+/* Get layout from server. */
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
 	   u32 iomode)
 {
 	struct inode *ino = lo->inode;
+	struct pnfs_layout_segment *lseg;
 
-	set_bit(lo_fail_bit(iomode), &lo->state);
+	/* Lets pretend we sent LAYOUTGET and got a response */
+	lseg = kzalloc(sizeof(*lseg), GFP_KERNEL);
+	if (!lseg) {
+		set_bit(lo_fail_bit(iomode), &lo->state);
+		spin_lock(&ino->i_lock);
+		put_layout_hdr_locked(lo);
+		spin_unlock(&ino->i_lock);
+		return NULL;
+	}
+	init_lseg(lo, lseg);
+	lseg->iomode = IOMODE_RW;
 	spin_lock(&ino->i_lock);
+	pnfs_insert_layout(lo, lseg);
 	put_layout_hdr_locked(lo);
 	spin_unlock(&ino->i_lock);
-	return NULL;
+	return lseg;
+}
+
+static void
+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s:Begin\n", __func__);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	if (list_empty(&lo->segs)) {
+		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		BUG_ON(!list_empty(&lo->layouts));
+		list_add_tail(&lo->layouts, &clp->cl_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
+	get_layout_hdr_locked(lo);
+	/* STUB - add the constructed lseg if necessary */
+	if (list_empty(&lo->segs)) {
+		list_add_tail(&lseg->fi_list, &lo->segs);
+		dprintk("%s: inserted lseg %p iomode %d at tail\n",
+			__func__, lseg, lseg->iomode);
+	} else {
+		/* There is no harm for the moment in calling this
+		 * with the lock held, and the call will be removed
+		 * with the STUB.
+		 */
+		put_lseg(lseg);
+	}
+
+	dprintk("%s:Return\n", __func__);
 }
 
 static struct pnfs_layout_hdr *
@@ -226,6 +378,8 @@ alloc_init_layout_hdr(struct inode *ino)
 	if (!lo)
 		return NULL;
 	lo->refcount = 1;
+	INIT_LIST_HEAD(&lo->layouts);
+	INIT_LIST_HEAD(&lo->segs);
 	lo->inode = ino;
 	return lo;
 }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4ed1b48c71b1..1c3eb02f4944 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,13 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 
+struct pnfs_layout_segment {
+	struct list_head fi_list;
+	u32 iomode;
+	struct kref kref;
+	struct pnfs_layout_hdr *layout;
+};
+
 #ifdef CONFIG_NFS_V4_1
 
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -51,6 +58,8 @@ struct pnfs_layoutdriver_type {
 
 struct pnfs_layout_hdr {
 	unsigned long		refcount;
+	struct list_head	layouts;   /* other client layouts */
+	struct list_head	segs;      /* layout segments list */
 	unsigned long		state;
 	struct inode		*inode;
 };
@@ -64,6 +73,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
 
 
 static inline int lo_fail_bit(u32 iomode)
@@ -80,6 +90,10 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 
 #else  /* CONFIG_NFS_V4_1 */
 
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
 static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 }
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index c38619d95a57..4d62f1581ed1 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -82,6 +82,7 @@ struct nfs_client {
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
+	struct list_head	cl_layouts;
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
cgit v1.2.3


From b1f69b754ee312ec75f2c7ead0e6851cd9598cc2 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 20 Oct 2010 00:18:03 -0400
Subject: NFSv4.1: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure

Add the ability to actually send LAYOUTGET and GETDEVICEINFO.  This also adds
in the machinery to handle layout state and the deviceid cache.  Note that
GETDEVICEINFO is not called directly by the generic layer.  Instead it
is called by the drivers while parsing the LAYOUTGET opaque data in response
to an unknown device id embedded therein.  RFC 5661 only encodes
device ids within the driver-specific opaque data.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c         | 142 +++++++++++++++++
 fs/nfs/nfs4xdr.c          | 302 ++++++++++++++++++++++++++++++++++++
 fs/nfs/pnfs.c             | 385 ++++++++++++++++++++++++++++++++++++++++++----
 fs/nfs/pnfs.h             |  73 ++++++++-
 include/linux/nfs4.h      |   2 +
 include/linux/nfs_fs_sb.h |   1 +
 include/linux/nfs_xdr.h   |  49 ++++++
 7 files changed, 921 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a5f1edb45b47..7e14e991ddfa 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "callback.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -5256,6 +5257,147 @@ out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
 }
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *ino = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+				&lgp->res.seq_res, 0, task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LAYOUTTRYLATER:
+	case -NFS4ERR_RECALLCONFLICT:
+		task->tk_status = -NFS4ERR_DELAY;
+		/* Fall through */
+	default:
+		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+	lgp->status = task->tk_status;
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	put_layout_hdr(lgp->args.inode);
+	if (lgp->res.layout.buf != NULL)
+		free_page((unsigned long) lgp->res.layout.buf);
+	put_nfs_open_context(lgp->args.ctx);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+	.rpc_call_prepare = nfs4_layoutget_prepare,
+	.rpc_call_done = nfs4_layoutget_done,
+	.rpc_release = nfs4_layoutget_release,
+};
+
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+		.rpc_argp = &lgp->args,
+		.rpc_resp = &lgp->res,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutget_call_ops,
+		.callback_data = lgp,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+	if (lgp->res.layout.buf == NULL) {
+		nfs4_layoutget_release(lgp);
+		return -ENOMEM;
+	}
+
+	lgp->res.seq_res.sr_slot = NULL;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0)
+		goto out;
+	status = lgp->status;
+	if (status != 0)
+		goto out;
+	status = pnfs_layout_process(lgp);
+out:
+	rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
+
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+	struct nfs4_getdeviceinfo_args args = {
+		.pdev = pdev,
+	};
+	struct nfs4_getdeviceinfo_res res = {
+		.pdev = pdev,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	return status;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+					_nfs4_proc_getdeviceinfo(server, pdev),
+					&exception);
+	} while (exception.retry);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8b4dfa393f0f..f313c4cce7e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+				1 /* layout type */ + \
+				1 /* opaque devaddr4 length */ + \
+				  /* devaddr4 payload is read into page */ \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap */)
+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
+				encode_stateid_maxsz)
+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+				decode_stateid_maxsz + \
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					 decode_sequence_maxsz + \
 					 decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+				encode_sequence_maxsz +\
+				encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+				decode_sequence_maxsz + \
+				decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz +        \
+				encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz +        \
+				decode_layoutget_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
 				      compound_encode_hdr_maxsz +
@@ -1737,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+		     const struct nfs4_getdeviceinfo_args *args,
+		     struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(OP_GETDEVICEINFO);
+	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+				    NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(args->pdev->layout_type);
+	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */
+	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+	hdr->nops++;
+	hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+		      const struct nfs4_layoutget_args *args,
+		      struct compound_hdr *hdr)
+{
+	nfs4_stateid stateid;
+	__be32 *p;
+
+	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+	*p++ = cpu_to_be32(OP_LAYOUTGET);
+	*p++ = cpu_to_be32(0);     /* Signal layout available */
+	*p++ = cpu_to_be32(args->type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	p = xdr_encode_hyper(p, args->minlength);
+	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+				args->ctx->state);
+	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+	*p = cpu_to_be32(args->maxcount);
+
+	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+		__func__,
+		args->type,
+		args->range.iomode,
+		(unsigned long)args->range.offset,
+		(unsigned long)args->range.length,
+		args->maxcount);
+	hdr->nops++;
+	hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" ENCODE ROUTINES.
  */
@@ -2554,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
 	return 0;
 }
 
+/*
+ * Encode GETDEVICEINFO request
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+				      struct nfs4_getdeviceinfo_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(&xdr, args, &hdr);
+
+	/* set up reply kvec. Subtract notification bitmap max size (2)
+	 * so that notification bitmap is put in xdr_buf tail */
+	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+			 args->pdev->pages, args->pdev->pgbase,
+			 args->pdev->pglen);
+
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ *  Encode LAYOUTGET request
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+				  struct nfs4_layoutget_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -4830,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+				struct pnfs_device *pdev)
+{
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		int i;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			goto out_overflow;
+		for (i = 0; i < len; i++, p++) {
+			if (be32_to_cpup(p)) {
+				dprintk("%s: notifications not supported\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p++);
+	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 24);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+
+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layout.len);
+
+	/* nfs4_proc_layoutget allocated a single page */
+	if (res->layout.len > PAGE_SIZE)
+		return -ENOMEM;
+	memcpy(res->layout.buf, p, res->layout.len);
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -5857,6 +6110,53 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
 		status = decode_reclaim_complete(&xdr, (void *)NULL);
 	return status;
 }
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+				      struct nfs4_getdeviceinfo_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status != 0)
+		goto out;
+	status = decode_getdeviceinfo(&xdr, res->pdev);
+out:
+	return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+				  struct nfs4_layoutget_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(&xdr);
+	if (status)
+		goto out;
+	status = decode_layoutget(&xdr, rqstp, res);
+out:
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
@@ -6048,6 +6348,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(SEQUENCE,	enc_sequence,	dec_sequence),
   PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
   PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 891a0c36f992..d1ad7df3479e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -140,6 +140,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
 		printk(KERN_ERR "%s id 0 is reserved\n", __func__);
 		return status;
 	}
+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+		printk(KERN_ERR "%s Layout driver must provide "
+		       "alloc_lseg and free_lseg.\n", __func__);
+		return status;
+	}
 
 	spin_lock(&pnfs_spinlock);
 	tmp = find_pnfs_driver_locked(ld_type->id);
@@ -168,6 +173,10 @@ pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
 }
 EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 
+/*
+ * pNFS client layout cache
+ */
+
 static void
 get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 {
@@ -190,7 +199,7 @@ put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
 	}
 }
 
-static void
+void
 put_layout_hdr(struct inode *inode)
 {
 	spin_lock(&inode->i_lock);
@@ -215,7 +224,7 @@ destroy_lseg(struct kref *kref)
 	struct inode *ino = lseg->layout->inode;
 
 	dprintk("--> %s\n", __func__);
-	kfree(lseg);
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
 	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
 	put_layout_hdr(ino);
 }
@@ -249,6 +258,9 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
 	/* List does not take a reference, so no need for put here */
 	list_del_init(&lo->layouts);
 	spin_unlock(&clp->cl_lock);
+	write_seqlock(&lo->seqlock);
+	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -307,40 +319,135 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	}
 }
 
-static void pnfs_insert_layout(struct pnfs_layout_hdr *lo,
-			       struct pnfs_layout_segment *lseg);
+/* update lo->stateid with new if is more recent
+ *
+ * lo->stateid could be the open stateid, in which case we just use what given.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+			const nfs4_stateid *new)
+{
+	nfs4_stateid *old = &lo->stateid;
+	bool overwrite = false;
+
+	write_seqlock(&lo->seqlock);
+	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+		overwrite = true;
+	else {
+		u32 oldseq, newseq;
+
+		oldseq = be32_to_cpu(old->stateid.seqid);
+		newseq = be32_to_cpu(new->stateid.seqid);
+		if ((int)(newseq - oldseq) > 0)
+			overwrite = true;
+	}
+	if (overwrite)
+		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+	write_sequnlock(&lo->seqlock);
+}
+
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+			      struct nfs4_state *state)
+{
+	int seq;
+
+	dprintk("--> %s\n", __func__);
+	write_seqlock(&lo->seqlock);
+	do {
+		seq = read_seqbegin(&state->seqlock);
+		memcpy(lo->stateid.data, state->stateid.data,
+		       sizeof(state->stateid.data));
+	} while (read_seqretry(&state->seqlock, seq));
+	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
+	dprintk("<-- %s\n", __func__);
+}
+
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			struct nfs4_state *open_state)
+{
+	int seq;
 
-/* Get layout from server. */
+	dprintk("--> %s\n", __func__);
+	do {
+		seq = read_seqbegin(&lo->seqlock);
+		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+			/* This will trigger retry of the read */
+			pnfs_layout_from_open_stateid(lo, open_state);
+		} else
+			memcpy(dst->data, lo->stateid.data,
+			       sizeof(lo->stateid.data));
+	} while (read_seqretry(&lo->seqlock, seq));
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
 	   u32 iomode)
 {
 	struct inode *ino = lo->inode;
-	struct pnfs_layout_segment *lseg;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs4_layoutget *lgp;
+	struct pnfs_layout_segment *lseg = NULL;
+
+	dprintk("--> %s\n", __func__);
 
-	/* Lets pretend we sent LAYOUTGET and got a response */
-	lseg = kzalloc(sizeof(*lseg), GFP_KERNEL);
+	BUG_ON(ctx == NULL);
+	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+	if (lgp == NULL) {
+		put_layout_hdr(lo->inode);
+		return NULL;
+	}
+	lgp->args.minlength = NFS4_MAX_UINT64;
+	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+	lgp->args.range.iomode = iomode;
+	lgp->args.range.offset = 0;
+	lgp->args.range.length = NFS4_MAX_UINT64;
+	lgp->args.type = server->pnfs_curr_ld->id;
+	lgp->args.inode = ino;
+	lgp->args.ctx = get_nfs_open_context(ctx);
+	lgp->lsegpp = &lseg;
+
+	/* Synchronously retrieve layout information from server and
+	 * store in lseg.
+	 */
+	nfs4_proc_layoutget(lgp);
 	if (!lseg) {
+		/* remember that LAYOUTGET failed and suspend trying */
 		set_bit(lo_fail_bit(iomode), &lo->state);
-		spin_lock(&ino->i_lock);
-		put_layout_hdr_locked(lo);
-		spin_unlock(&ino->i_lock);
-		return NULL;
 	}
-	init_lseg(lo, lseg);
-	lseg->iomode = IOMODE_RW;
-	spin_lock(&ino->i_lock);
-	pnfs_insert_layout(lo, lseg);
-	put_layout_hdr_locked(lo);
-	spin_unlock(&ino->i_lock);
 	return lseg;
 }
 
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+	/* read > read/write */
+	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+}
+
 static void
 pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		   struct pnfs_layout_segment *lseg)
 {
+	struct pnfs_layout_segment *lp;
+	int found = 0;
+
 	dprintk("%s:Begin\n", __func__);
 
 	assert_spin_locked(&lo->inode->i_lock);
@@ -352,19 +459,28 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		list_add_tail(&lo->layouts, &clp->cl_layouts);
 		spin_unlock(&clp->cl_lock);
 	}
-	get_layout_hdr_locked(lo);
-	/* STUB - add the constructed lseg if necessary */
-	if (list_empty(&lo->segs)) {
+	list_for_each_entry(lp, &lo->segs, fi_list) {
+		if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+			continue;
+		list_add_tail(&lseg->fi_list, &lp->fi_list);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu before "
+			"lp %p iomode %d offset %llu length %llu\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length,
+			lp, lp->range.iomode, lp->range.offset,
+			lp->range.length);
+		found = 1;
+		break;
+	}
+	if (!found) {
 		list_add_tail(&lseg->fi_list, &lo->segs);
-		dprintk("%s: inserted lseg %p iomode %d at tail\n",
-			__func__, lseg, lseg->iomode);
-	} else {
-		/* There is no harm for the moment in calling this
-		 * with the lock held, and the call will be removed
-		 * with the STUB.
-		 */
-		put_lseg(lseg);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu at tail\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length);
 	}
+	get_layout_hdr_locked(lo);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -380,6 +496,7 @@ alloc_init_layout_hdr(struct inode *ino)
 	lo->refcount = 1;
 	INIT_LIST_HEAD(&lo->layouts);
 	INIT_LIST_HEAD(&lo->segs);
+	seqlock_init(&lo->seqlock);
 	lo->inode = ino;
 	return lo;
 }
@@ -407,11 +524,46 @@ pnfs_find_alloc_layout(struct inode *ino)
 	return nfsi->layout;
 }
 
-/* STUB - LAYOUTGET never succeeds, so cache is empty */
+/*
+ * iomode matching rules:
+ * iomode	lseg	match
+ * -----	-----	-----
+ * ANY		READ	true
+ * ANY		RW	true
+ * RW		READ	false
+ * RW		RW	true
+ * READ		READ	true
+ * READ		RW	true
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+	return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+}
+
+/*
+ * lookup range in layout
+ */
 static struct pnfs_layout_segment *
 pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
 {
-	return NULL;
+	struct pnfs_layout_segment *lseg, *ret = NULL;
+
+	dprintk("%s:Begin\n", __func__);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry(lseg, &lo->segs, fi_list) {
+		if (is_matching_lseg(lseg, iomode)) {
+			ret = lseg;
+			break;
+		}
+		if (cmp_layout(iomode, lseg->range.iomode) > 0)
+			break;
+	}
+
+	dprintk("%s:Return lseg %p ref %d\n",
+		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+	return ret;
 }
 
 /*
@@ -448,7 +600,7 @@ pnfs_update_layout(struct inode *ino,
 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
 		goto out_unlock;
 
-	get_layout_hdr_locked(lo);
+	get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
 	spin_unlock(&ino->i_lock);
 
 	lseg = send_layoutget(lo, ctx, iomode);
@@ -460,3 +612,172 @@ out_unlock:
 	spin_unlock(&ino->i_lock);
 	goto out;
 }
+
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+	struct nfs4_layoutget_res *res = &lgp->res;
+	struct pnfs_layout_segment *lseg;
+	struct inode *ino = lo->inode;
+	int status = 0;
+
+	/* Inject layout blob into I/O device driver */
+	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+	if (!lseg || IS_ERR(lseg)) {
+		if (!lseg)
+			status = -ENOMEM;
+		else
+			status = PTR_ERR(lseg);
+		dprintk("%s: Could not allocate layout: error %d\n",
+		       __func__, status);
+		goto out;
+	}
+
+	spin_lock(&ino->i_lock);
+	init_lseg(lo, lseg);
+	lseg->range = res->range;
+	*lgp->lsegpp = lseg;
+	pnfs_insert_layout(lo, lseg);
+
+	/* Done processing layoutget. Set the layout stateid */
+	pnfs_set_layout_stateid(lo, &res->stateid);
+	spin_unlock(&ino->i_lock);
+out:
+	return status;
+}
+
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct pnfs_deviceid_node *))
+{
+	struct pnfs_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		atomic_inc(&clp->cl_devid_cache->dc_ref);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_ref));
+		kfree(c);
+	} else {
+		/* kzalloc initializes hlists */
+		spin_lock_init(&c->dc_lock);
+		atomic_set(&c->dc_ref, 1);
+		c->dc_free_callback = free_callback;
+		clp->cl_devid_cache = c;
+		dprintk("%s [new]\n", __func__);
+	}
+	spin_unlock(&clp->cl_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+		  struct pnfs_deviceid_node *devid)
+{
+	struct nfs4_deviceid *id = &devid->de_id;
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long h = nfs4_deviceid_hash(id);
+
+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+		return;
+
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			hlist_del_rcu(&d->de_node);
+			spin_unlock(&c->dc_lock);
+			synchronize_rcu();
+			c->dc_free_callback(devid);
+			return;
+		}
+	spin_unlock(&c->dc_lock);
+	/* Why wasn't it found in  the list? */
+	BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+
+/* Find and reference a deviceid */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->de_ref)) {
+				goto fail;
+			} else {
+				rcu_read_unlock();
+				return d;
+			}
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+	struct pnfs_deviceid_node *d;
+	long hash = nfs4_deviceid_hash(&new->de_id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	spin_lock(&c->dc_lock);
+	d = pnfs_find_get_deviceid(c, &new->de_id);
+	if (d) {
+		spin_unlock(&c->dc_lock);
+		dprintk("%s [discard]\n", __func__);
+		c->dc_free_callback(new);
+		return d;
+	}
+	INIT_HLIST_NODE(&new->de_node);
+	atomic_set(&new->de_ref, 1);
+	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+	return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+
+	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+		int i;
+		/* Verify cache is empty */
+		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+		clp->cl_devid_cache = NULL;
+		spin_unlock(&clp->cl_lock);
+		kfree(local);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1c3eb02f4944..cbba28cb02a7 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,7 +32,7 @@
 
 struct pnfs_layout_segment {
 	struct list_head fi_list;
-	u32 iomode;
+	struct pnfs_layout_range range;
 	struct kref kref;
 	struct pnfs_layout_hdr *layout;
 };
@@ -44,6 +44,7 @@ struct pnfs_layout_segment {
 enum {
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
+	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */
 };
 
 /* Per-layout driver specific registration structure */
@@ -54,26 +55,96 @@ struct pnfs_layoutdriver_type {
 	struct module *owner;
 	int (*initialize_mountpoint) (struct nfs_server *);
 	int (*uninitialize_mountpoint) (struct nfs_server *);
+	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 };
 
 struct pnfs_layout_hdr {
 	unsigned long		refcount;
 	struct list_head	layouts;   /* other client layouts */
 	struct list_head	segs;      /* layout segments list */
+	seqlock_t		seqlock;   /* Protects the stateid */
+	nfs4_stateid		stateid;
 	unsigned long		state;
 	struct inode		*inode;
 };
 
+struct pnfs_device {
+	struct nfs4_deviceid dev_id;
+	unsigned int  layout_type;
+	unsigned int  mincount;
+	struct page **pages;
+	void          *area;
+	unsigned int  pgbase;
+	unsigned int  pglen;
+};
+
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS	5
+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+struct pnfs_deviceid_node {
+	struct hlist_node	de_node;
+	struct nfs4_deviceid	de_id;
+	atomic_t		de_ref;
+};
+
+struct pnfs_deviceid_cache {
+	spinlock_t		dc_lock;
+	atomic_t		dc_ref;
+	void			(*dc_free_callback)(struct pnfs_deviceid_node *);
+	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+
+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
+			void (*free_callback)(struct pnfs_deviceid_node *));
+extern void pnfs_put_deviceid_cache(struct nfs_client *);
+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
+				struct pnfs_deviceid_cache *,
+				struct nfs4_deviceid *);
+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
+				struct pnfs_deviceid_cache *,
+				struct pnfs_deviceid_node *);
+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+			      struct pnfs_deviceid_node *devid);
+
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+				   struct pnfs_device *dev);
+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+
+/* pnfs.c */
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
+void put_layout_hdr(struct inode *inode);
+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			     struct nfs4_state *open_state);
 
 
 static inline int lo_fail_bit(u32 iomode)
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 34da32436ac0..a9683d6acaa4 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -545,6 +545,8 @@ enum {
 	NFSPROC4_CLNT_SEQUENCE,
 	NFSPROC4_CLNT_GET_LEASE_TIME,
 	NFSPROC4_CLNT_RECLAIM_COMPLETE,
+	NFSPROC4_CLNT_LAYOUTGET,
+	NFSPROC4_CLNT_GETDEVICEINFO,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4d62f1581ed1..452d96436d26 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -83,6 +83,7 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_layouts;
+	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 065f9d105d05..ba6cc8f223c9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -187,6 +187,55 @@ struct nfs4_get_lease_time_res {
 	struct nfs4_sequence_res	lr_seq_res;
 };
 
+#define PNFS_LAYOUT_MAXSIZE 4096
+
+struct nfs4_layoutdriver_data {
+	__u32 len;
+	void *buf;
+};
+
+struct pnfs_layout_range {
+	u32 iomode;
+	u64 offset;
+	u64 length;
+};
+
+struct nfs4_layoutget_args {
+	__u32 type;
+	struct pnfs_layout_range range;
+	__u64 minlength;
+	__u32 maxcount;
+	struct inode *inode;
+	struct nfs_open_context *ctx;
+	struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutget_res {
+	__u32 return_on_close;
+	struct pnfs_layout_range range;
+	__u32 type;
+	nfs4_stateid stateid;
+	struct nfs4_layoutdriver_data layout;
+	struct nfs4_sequence_res seq_res;
+};
+
+struct nfs4_layoutget {
+	struct nfs4_layoutget_args args;
+	struct nfs4_layoutget_res res;
+	struct pnfs_layout_segment **lsegpp;
+	int status;
+};
+
+struct nfs4_getdeviceinfo_args {
+	struct pnfs_device *pdev;
+	struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_getdeviceinfo_res {
+	struct pnfs_device *pdev;
+	struct nfs4_sequence_res seq_res;
+};
+
 /*
  * Arguments to the open call.
  */
-- 
cgit v1.2.3


From 16b374ca439fb406e46e071f75428f5b033056f8 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 20 Oct 2010 00:18:04 -0400
Subject: NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO
 infrastructure

Implement the driver's io_ops->alloc_lseg and free_lseg functions,
which integrate into the deviceid cache and calls out to
nfs4_proc_getdeviceinfo when necessary.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Dean Hildebrand <dhildebz@umich.edu>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Ricardo Labiaga <ricardo.labiaga@netapp.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile            |   2 +-
 fs/nfs/client.c            |   1 +
 fs/nfs/nfs4filelayout.c    | 204 ++++++++++++++++++++-
 fs/nfs/nfs4filelayout.h    |  94 ++++++++++
 fs/nfs/nfs4filelayoutdev.c | 448 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 747 insertions(+), 2 deletions(-)
 create mode 100644 fs/nfs/nfs4filelayout.h
 create mode 100644 fs/nfs/nfs4filelayoutdev.c

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 08a8889c5149..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -20,4 +20,4 @@ nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
-nfs_layout_nfsv41_files-y := nfs4filelayout.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e55ad211dbed..8934b2e339df 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -255,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
 		nfs_free_client(clp);
 	}
 }
+EXPORT_SYMBOL_GPL(nfs_put_client);
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 696e283c2632..9e82b51cb515 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -30,7 +30,9 @@
  */
 
 #include <linux/nfs_fs.h>
-#include "pnfs.h"
+
+#include "internal.h"
+#include "nfs4filelayout.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -41,23 +43,223 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver");
 int
 filelayout_initialize_mountpoint(struct nfs_server *nfss)
 {
+	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+						nfs4_fl_free_deviceid_callback);
+	if (status) {
+		printk(KERN_WARNING "%s: deviceid cache could not be "
+			"initialized\n", __func__);
+		return status;
+	}
+	dprintk("%s: deviceid cache has been initialized successfully\n",
+		__func__);
 	return 0;
 }
 
+/* Uninitialize a mountpoint by destroying its device list */
 int
 filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
 {
 	dprintk("--> %s\n", __func__);
 
+	if (nfss->nfs_client->cl_devid_cache)
+		pnfs_put_deviceid_cache(nfss->nfs_client);
 	return 0;
 }
 
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+			struct nfs4_filelayout_segment *fl,
+			struct nfs4_layoutget_res *lgr,
+			struct nfs4_deviceid *id)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	int status = -EINVAL;
+	struct nfs_server *nfss = NFS_SERVER(lo->inode);
+
+	dprintk("--> %s\n", __func__);
+
+	if (fl->pattern_offset > lgr->range.offset) {
+		dprintk("%s pattern_offset %lld to large\n",
+				__func__, fl->pattern_offset);
+		goto out;
+	}
+
+	if (fl->stripe_unit % PAGE_SIZE) {
+		dprintk("%s Stripe unit (%u) not page aligned\n",
+			__func__, fl->stripe_unit);
+		goto out;
+	}
+
+	/* find and reference the deviceid */
+	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+	if (dsaddr == NULL) {
+		dsaddr = get_device_info(lo->inode, id);
+		if (dsaddr == NULL)
+			goto out;
+	}
+	fl->dsaddr = dsaddr;
+
+	if (fl->first_stripe_index < 0 ||
+	    fl->first_stripe_index >= dsaddr->stripe_count) {
+		dprintk("%s Bad first_stripe_index %d\n",
+				__func__, fl->first_stripe_index);
+		goto out_put;
+	}
+
+	if ((fl->stripe_type == STRIPE_SPARSE &&
+	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+	    (fl->stripe_type == STRIPE_DENSE &&
+	    fl->num_fh != dsaddr->stripe_count)) {
+		dprintk("%s num_fh %u not valid for given packing\n",
+			__func__, fl->num_fh);
+		goto out_put;
+	}
+
+	if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
+		dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+			"wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
+			nfss->wsize);
+	}
+
+	status = 0;
+out:
+	dprintk("--> %s returns %d\n", __func__, status);
+	return status;
+out_put:
+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+	goto out;
+}
+
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+	int i;
+
+	for (i = 0; i < fl->num_fh; i++) {
+		if (!fl->fh_array[i])
+			break;
+		kfree(fl->fh_array[i]);
+	}
+	kfree(fl->fh_array);
+	fl->fh_array = NULL;
+}
+
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+	filelayout_free_fh_array(fl);
+	kfree(fl);
+}
+
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+			 struct nfs4_filelayout_segment *fl,
+			 struct nfs4_layoutget_res *lgr,
+			 struct nfs4_deviceid *id)
+{
+	uint32_t *p = (uint32_t *)lgr->layout.buf;
+	uint32_t nfl_util;
+	int i;
+
+	dprintk("%s: set_layout_map Begin\n", __func__);
+
+	memcpy(id, p, sizeof(*id));
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+	print_deviceid(id);
+
+	nfl_util = be32_to_cpup(p++);
+	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+		fl->commit_through_mds = 1;
+	if (nfl_util & NFL4_UFLG_DENSE)
+		fl->stripe_type = STRIPE_DENSE;
+	else
+		fl->stripe_type = STRIPE_SPARSE;
+	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+	fl->first_stripe_index = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &fl->pattern_offset);
+	fl->num_fh = be32_to_cpup(p++);
+
+	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+		__func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+		fl->pattern_offset);
+
+	fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
+			       GFP_KERNEL);
+	if (!fl->fh_array)
+		return -ENOMEM;
+
+	for (i = 0; i < fl->num_fh; i++) {
+		/* Do we want to use a mempool here? */
+		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+		if (!fl->fh_array[i]) {
+			filelayout_free_fh_array(fl);
+			return -ENOMEM;
+		}
+		fl->fh_array[i]->size = be32_to_cpup(p++);
+		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+			printk(KERN_ERR "Too big fh %d received %d\n",
+			       i, fl->fh_array[i]->size);
+			filelayout_free_fh_array(fl);
+			return -EIO;
+		}
+		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+		p += XDR_QUADLEN(fl->fh_array[i]->size);
+		dprintk("DEBUG: %s: fh len %d\n", __func__,
+			fl->fh_array[i]->size);
+	}
+
+	return 0;
+}
+
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+		      struct nfs4_layoutget_res *lgr)
+{
+	struct nfs4_filelayout_segment *fl;
+	int rc;
+	struct nfs4_deviceid id;
+
+	dprintk("--> %s\n", __func__);
+	fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+	if (!fl)
+		return NULL;
+
+	rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
+	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
+		_filelayout_free_lseg(fl);
+		return NULL;
+	}
+	return &fl->generic_hdr;
+}
+
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
+	dprintk("--> %s\n", __func__);
+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
+			  &fl->dsaddr->deviceid);
+	_filelayout_free_lseg(fl);
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id = LAYOUT_NFSV4_1_FILES,
 	.name = "LAYOUT_NFSV4_1_FILES",
 	.owner = THIS_MODULE,
 	.initialize_mountpoint   = filelayout_initialize_mountpoint,
 	.uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
+	.alloc_lseg              = filelayout_alloc_lseg,
+	.free_lseg               = filelayout_free_lseg,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "pnfs.h"
+
+/*
+ * Field testing shows we need to support upto 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+
+enum stripetype4 {
+	STRIPE_SPARSE = 1,
+	STRIPE_DENSE = 2
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds {
+	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	u32			ds_ip_addr;
+	u32			ds_port;
+	struct nfs_client	*ds_clp;
+	atomic_t		ds_count;
+};
+
+struct nfs4_file_layout_dsaddr {
+	struct pnfs_deviceid_node	deviceid;
+	u32				stripe_count;
+	u8				*stripe_indices;
+	u32				ds_num;
+	struct nfs4_pnfs_ds		*ds_list[1];
+};
+
+struct nfs4_filelayout_segment {
+	struct pnfs_layout_segment generic_hdr;
+	u32 stripe_type;
+	u32 commit_through_mds;
+	u32 stripe_unit;
+	u32 first_stripe_index;
+	u64 pattern_offset;
+	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+	unsigned int num_fh;
+	struct nfs_fh **fh_array;
+};
+
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg,
+			    struct nfs4_filelayout_segment,
+			    generic_hdr);
+}
+
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+	if (ds == NULL) {
+		printk("%s NULL device\n", __func__);
+		return;
+	}
+	printk("        ip_addr %x port %hu\n"
+		"        ref count %d\n"
+		"        client %p\n"
+		"        cl_exchange_flags %x\n",
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+		atomic_read(&ds->ds_count), ds->ds_clp,
+		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	int i;
+
+	ifdebug(FACILITY) {
+		printk("%s dsaddr->ds_num %d\n", __func__,
+		       dsaddr->ds_num);
+		for (i = 0; i < dsaddr->ds_num; i++)
+			print_ds(dsaddr->ds_list[i]);
+	}
+}
+
+void print_deviceid(struct nfs4_deviceid *id)
+{
+	u32 *p = (u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		p[0], p[1], p[2], p[3]);
+}
+
+/* nfs4_ds_cache_lock is held */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+	struct nfs4_pnfs_ds *ds;
+
+	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+			ntohl(ip_addr), ntohs(port));
+
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
+		if (ds->ds_ip_addr == ip_addr &&
+		    ds->ds_port == port) {
+			return ds;
+		}
+	}
+	return NULL;
+}
+
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+	dprintk("--> %s\n", __func__);
+	ifdebug(FACILITY)
+		print_ds(ds);
+
+	if (ds->ds_clp)
+		nfs_put_client(ds->ds_clp);
+	kfree(ds);
+}
+
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	struct nfs4_pnfs_ds *ds;
+	int i;
+
+	print_deviceid(&dsaddr->deviceid.de_id);
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		ds = dsaddr->ds_list[i];
+		if (ds != NULL) {
+			if (atomic_dec_and_lock(&ds->ds_count,
+						&nfs4_ds_cache_lock)) {
+				list_del_init(&ds->ds_node);
+				spin_unlock(&nfs4_ds_cache_lock);
+				destroy_ds(ds);
+			}
+		}
+	}
+	kfree(dsaddr->stripe_indices);
+	kfree(dsaddr);
+}
+
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr =
+		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
+
+	nfs4_fl_free_deviceid(dsaddr);
+}
+
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+	struct nfs4_pnfs_ds *tmp_ds, *ds;
+
+	ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
+	if (!ds)
+		goto out;
+
+	spin_lock(&nfs4_ds_cache_lock);
+	tmp_ds = _data_server_lookup_locked(ip_addr, port);
+	if (tmp_ds == NULL) {
+		ds->ds_ip_addr = ip_addr;
+		ds->ds_port = port;
+		atomic_set(&ds->ds_count, 1);
+		INIT_LIST_HEAD(&ds->ds_node);
+		ds->ds_clp = NULL;
+		list_add(&ds->ds_node, &nfs4_data_server_cache);
+		dprintk("%s add new data server ip 0x%x\n", __func__,
+			ds->ds_ip_addr);
+	} else {
+		kfree(ds);
+		atomic_inc(&tmp_ds->ds_count);
+		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+			__func__, tmp_ds->ds_ip_addr,
+			atomic_read(&tmp_ds->ds_count));
+		ds = tmp_ds;
+	}
+	spin_unlock(&nfs4_ds_cache_lock);
+out:
+	return ds;
+}
+
+/*
+ * Currently only support ipv4, and one multi-path address.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+	struct nfs4_pnfs_ds *ds = NULL;
+	char *buf;
+	const char *ipend, *pstr;
+	u32 ip_addr, port;
+	int nlen, rlen, i;
+	int tmp[2];
+	__be32 *r_netid, *r_addr, *p = *pp;
+
+	/* r_netid */
+	nlen = be32_to_cpup(p++);
+	r_netid = p;
+	p += XDR_QUADLEN(nlen);
+
+	/* r_addr */
+	rlen = be32_to_cpup(p++);
+	r_addr = p;
+	p += XDR_QUADLEN(rlen);
+	*pp = p;
+
+	/* Check that netid is "tcp" */
+	if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
+		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+		goto out_err;
+	}
+
+	/* ipv6 length plus port is legal */
+	if (rlen > INET6_ADDRSTRLEN + 8) {
+		dprintk("%s Invalid address, length %d\n", __func__,
+			rlen);
+		goto out_err;
+	}
+	buf = kmalloc(rlen + 1, GFP_KERNEL);
+	buf[rlen] = '\0';
+	memcpy(buf, r_addr, rlen);
+
+	/* replace the port dots with dashes for the in4_pton() delimiter*/
+	for (i = 0; i < 2; i++) {
+		char *res = strrchr(buf, '.');
+		*res = '-';
+	}
+
+	/* Currently only support ipv4 address */
+	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+		dprintk("%s: Only ipv4 addresses supported\n", __func__);
+		goto out_free;
+	}
+
+	/* port */
+	pstr = ipend;
+	sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
+	port = htons((tmp[0] << 8) | (tmp[1]));
+
+	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+	dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+	kfree(buf);
+out_err:
+	return ds;
+}
+
+/* Decode opaque device data and return the result */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+	int i, dummy;
+	u32 cnt, num;
+	u8 *indexp;
+	__be32 *p = (__be32 *)pdev->area, *indicesp;
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	/* Get the stripe count (number of stripe index) */
+	cnt = be32_to_cpup(p++);
+	dprintk("%s stripe count  %d\n", __func__, cnt);
+	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+		printk(KERN_WARNING "%s: stripe count %d greater than "
+		       "supported maximum %d\n", __func__,
+			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+		goto out_err;
+	}
+
+	/* Check the multipath list count */
+	indicesp = p;
+	p += XDR_QUADLEN(cnt << 2);
+	num = be32_to_cpup(p++);
+	dprintk("%s ds_num %u\n", __func__, num);
+	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+		printk(KERN_WARNING "%s: multipath count %d greater than "
+			"supported maximum %d\n", __func__,
+			num, NFS4_PNFS_MAX_MULTI_CNT);
+		goto out_err;
+	}
+	dsaddr = kzalloc(sizeof(*dsaddr) +
+			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+			GFP_KERNEL);
+	if (!dsaddr)
+		goto out_err;
+
+	dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+	if (!dsaddr->stripe_indices)
+		goto out_err_free;
+
+	dsaddr->stripe_count = cnt;
+	dsaddr->ds_num = num;
+
+	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+
+	/* Go back an read stripe indices */
+	p = indicesp;
+	indexp = &dsaddr->stripe_indices[0];
+	for (i = 0; i < dsaddr->stripe_count; i++) {
+		*indexp = be32_to_cpup(p++);
+		if (*indexp >= num)
+			goto out_err_free;
+		indexp++;
+	}
+	/* Skip already read multipath list count */
+	p++;
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		int j;
+
+		dummy = be32_to_cpup(p++); /* multipath count */
+		if (dummy > 1) {
+			printk(KERN_WARNING
+			       "%s: Multipath count %d not supported, "
+			       "skipping all greater than 1\n", __func__,
+				dummy);
+		}
+		for (j = 0; j < dummy; j++) {
+			if (j == 0) {
+				dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
+				if (dsaddr->ds_list[i] == NULL)
+					goto out_err_free;
+			} else {
+				u32 len;
+				/* skip extra multipath */
+				len = be32_to_cpup(p++);
+				p += XDR_QUADLEN(len);
+				len = be32_to_cpup(p++);
+				p += XDR_QUADLEN(len);
+				continue;
+			}
+		}
+	}
+	return dsaddr;
+
+out_err_free:
+	nfs4_fl_free_deviceid(dsaddr);
+out_err:
+	dprintk("%s ERROR: returning NULL\n", __func__);
+	return NULL;
+}
+
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices.
+ * If the deviceid is already cached, nfs4_add_deviceid will return
+ * a pointer to the cached struct and throw away the new.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	struct pnfs_deviceid_node *d;
+
+	dsaddr = decode_device(inode, dev);
+	if (!dsaddr) {
+		printk(KERN_WARNING "%s: Could not decode or add device\n",
+			__func__);
+		return NULL;
+	}
+
+	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+			      &dsaddr->deviceid);
+
+	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
+
+/*
+ * Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it.
+ */
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
+	struct pnfs_device *pdev = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+	int rc, i;
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+		__func__, inode, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
+	if (pdev == NULL)
+		return NULL;
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		kfree(pdev);
+		return NULL;
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	/* set pdev->area */
+	pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+	if (!pdev->area)
+		goto out_free;
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = PAGE_SIZE * max_pages;
+	pdev->mincount = 0;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	dsaddr = decode_and_add_device(inode, pdev);
+out_free:
+	if (pdev->area != NULL)
+		vunmap(pdev->area);
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+	kfree(pdev);
+	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+	return dsaddr;
+}
+
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+
+	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
+	return (d == NULL) ? NULL :
+		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
-- 
cgit v1.2.3


From 1c787096fce217b5fdd9806dbce96e738c9345c0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 21 Oct 2010 16:56:48 -0400
Subject: NFSv4.1: Use more sensible names for 'initialize_mountpoint'

The initialize_mountpoint/uninitialise_mountpoint functions are really about
setting or clearing the layout driver to be used on this filesystem. Change
the names to the more descriptive 'set_layoutdriver/clear_layoutdriver'.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c | 14 +++++++-------
 fs/nfs/pnfs.c           |  4 ++--
 fs/nfs/pnfs.h           |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 9e82b51cb515..2e92f0d8d654 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,8 +40,8 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
 MODULE_DESCRIPTION("The NFSv4 file layout driver");
 
-int
-filelayout_initialize_mountpoint(struct nfs_server *nfss)
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
 {
 	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
 						nfs4_fl_free_deviceid_callback);
@@ -55,9 +55,9 @@ filelayout_initialize_mountpoint(struct nfs_server *nfss)
 	return 0;
 }
 
-/* Uninitialize a mountpoint by destroying its device list */
-int
-filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
+/* Clear out the layout by destroying its device list */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
 {
 	dprintk("--> %s\n", __func__);
 
@@ -256,8 +256,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.id = LAYOUT_NFSV4_1_FILES,
 	.name = "LAYOUT_NFSV4_1_FILES",
 	.owner = THIS_MODULE,
-	.initialize_mountpoint   = filelayout_initialize_mountpoint,
-	.uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
+	.set_layoutdriver = filelayout_set_layoutdriver,
+	.clear_layoutdriver = filelayout_clear_layoutdriver,
 	.alloc_lseg              = filelayout_alloc_lseg,
 	.free_lseg               = filelayout_free_lseg,
 };
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d1ad7df3479e..db773428f95f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -75,7 +75,7 @@ void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
 	if (nfss->pnfs_curr_ld) {
-		nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss);
+		nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
 		module_put(nfss->pnfs_curr_ld->owner);
 	}
 	nfss->pnfs_curr_ld = NULL;
@@ -115,7 +115,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
 		goto out_no_driver;
 	}
 	server->pnfs_curr_ld = ld_type;
-	if (ld_type->initialize_mountpoint(server)) {
+	if (ld_type->set_layoutdriver(server)) {
 		printk(KERN_ERR
 		       "%s: Error initializing mount point for layout driver %u.\n",
 		       __func__, id);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index cbba28cb02a7..e12367d50489 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -53,8 +53,8 @@ struct pnfs_layoutdriver_type {
 	const u32 id;
 	const char *name;
 	struct module *owner;
-	int (*initialize_mountpoint) (struct nfs_server *);
-	int (*uninitialize_mountpoint) (struct nfs_server *);
+	int (*set_layoutdriver) (struct nfs_server *);
+	int (*clear_layoutdriver) (struct nfs_server *);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 };
-- 
cgit v1.2.3


From 3866f673ebd86e5be2533923f5c0aed91fe1669f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <w.sang@pengutronix.de>
Date: Wed, 1 Sep 2010 18:03:41 +0200
Subject: jffs2: use cond_resched() instead of yield()

yield() has different semantics meanwhile and even causes RT-kernels to
BUG. Replace the only appearance left in jffs2.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/erase.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index abac961f617b..e513f1913c15 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 		}
 
 		/* Be nice */
-		yield();
+		cond_resched();
 		mutex_lock(&c->erase_free_sem);
 		spin_lock(&c->erase_completion_lock);
 	}
-- 
cgit v1.2.3


From 088bd455c954c0c42edde9d4463e44be10101aac Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 22 Sep 2010 22:52:33 -0400
Subject: jffs2: drop unused model argument

The jffs2 compression framework provides a "model" argument when
compressing and decompressing, but the caller always passes in NULL
and the callees never use it.  So punt this useless overhead.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr.c       |  6 +++---
 fs/jffs2/compr.h       |  4 ++--
 fs/jffs2/compr_lzo.c   |  4 ++--
 fs/jffs2/compr_rtime.c |  6 ++----
 fs/jffs2/compr_rubin.c | 11 ++++-------
 fs/jffs2/compr_zlib.c  |  6 ++----
 6 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 617a1e5694c1..de4247021d25 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			spin_unlock(&jffs2_compressor_list_lock);
 			*datalen  = orig_slen;
 			*cdatalen = orig_dlen;
-			compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL);
+			compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
 			spin_lock(&jffs2_compressor_list_lock);
 			this->usecount--;
 			if (!compr_ret) {
@@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			spin_unlock(&jffs2_compressor_list_lock);
 			*datalen  = orig_slen;
 			*cdatalen = orig_dlen;
-			compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL);
+			compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
 			spin_lock(&jffs2_compressor_list_lock);
 			this->usecount--;
 			if (!compr_ret) {
@@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			if (comprtype == this->compr) {
 				this->usecount++;
 				spin_unlock(&jffs2_compressor_list_lock);
-				ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL);
+				ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
 				spin_lock(&jffs2_compressor_list_lock);
 				if (ret) {
 					printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index e471a9106fd9..13bb7597ab39 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -49,9 +49,9 @@ struct jffs2_compressor {
 	char *name;
 	char compr;			/* JFFS2_COMPR_XXX */
 	int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
-			uint32_t *srclen, uint32_t *destlen, void *model);
+			uint32_t *srclen, uint32_t *destlen);
 	int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
-			  uint32_t cdatalen, uint32_t datalen, void *model);
+			  uint32_t cdatalen, uint32_t datalen);
 	int usecount;
 	int disabled;		/* if set the compressor won't compress */
 	unsigned char *compr_buf;	/* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index ed25ae7c98eb..af186ee674d8 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -42,7 +42,7 @@ static int __init alloc_workspace(void)
 }
 
 static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
-			      uint32_t *sourcelen, uint32_t *dstlen, void *model)
+			      uint32_t *sourcelen, uint32_t *dstlen)
 {
 	size_t compress_size;
 	int ret;
@@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 }
 
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-				 uint32_t srclen, uint32_t destlen, void *model)
+				 uint32_t srclen, uint32_t destlen)
 {
 	size_t dl = destlen;
 	int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 9696ad9ef5f7..16a5047903a6 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -31,8 +31,7 @@
 /* _compress returns the compressed size, -1 if bigger */
 static int jffs2_rtime_compress(unsigned char *data_in,
 				unsigned char *cpage_out,
-				uint32_t *sourcelen, uint32_t *dstlen,
-				void *model)
+				uint32_t *sourcelen, uint32_t *dstlen)
 {
 	short positions[256];
 	int outpos = 0;
@@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
 
 static int jffs2_rtime_decompress(unsigned char *data_in,
 				  unsigned char *cpage_out,
-				  uint32_t srclen, uint32_t destlen,
-				  void *model)
+				  uint32_t srclen, uint32_t destlen)
 {
 	short positions[256];
 	int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index a12b4f763373..9e7cec808c4c 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
 #if 0
 /* _compress returns the compressed size, -1 if bigger */
 int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
-		   uint32_t *sourcelen, uint32_t *dstlen, void *model)
+		   uint32_t *sourcelen, uint32_t *dstlen)
 {
 	return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
 				 cpage_out, sourcelen, dstlen);
@@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
 #endif
 static int jffs2_dynrubin_compress(unsigned char *data_in,
 				   unsigned char *cpage_out,
-				   uint32_t *sourcelen, uint32_t *dstlen,
-				   void *model)
+				   uint32_t *sourcelen, uint32_t *dstlen)
 {
 	int bits[8];
 	unsigned char histo[256];
@@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
 
 static int jffs2_rubinmips_decompress(unsigned char *data_in,
 				      unsigned char *cpage_out,
-				      uint32_t sourcelen, uint32_t dstlen,
-				      void *model)
+				      uint32_t sourcelen, uint32_t dstlen)
 {
 	rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
 			    cpage_out, sourcelen, dstlen);
@@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
 
 static int jffs2_dynrubin_decompress(unsigned char *data_in,
 				     unsigned char *cpage_out,
-				     uint32_t sourcelen, uint32_t dstlen,
-				     void *model)
+				     uint32_t sourcelen, uint32_t dstlen)
 {
 	int bits[8];
 	int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 97fc45de6f81..fd05a0b9431d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -68,8 +68,7 @@ static void free_workspaces(void)
 
 static int jffs2_zlib_compress(unsigned char *data_in,
 			       unsigned char *cpage_out,
-			       uint32_t *sourcelen, uint32_t *dstlen,
-			       void *model)
+			       uint32_t *sourcelen, uint32_t *dstlen)
 {
 	int ret;
 
@@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
 
 static int jffs2_zlib_decompress(unsigned char *data_in,
 				 unsigned char *cpage_out,
-				 uint32_t srclen, uint32_t destlen,
-				 void *model)
+				 uint32_t srclen, uint32_t destlen)
 {
 	int ret;
 	int wbits = MAX_WBITS;
-- 
cgit v1.2.3


From 65e5a0e18e5fb5bc6cfabd8ef4b9fc1c8569ba62 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Thu, 7 Oct 2010 19:14:02 +0100
Subject: jffs2: Dynamically choose inocache hash size

When JFFS2 is used for large volumes, the mount times are quite long.
Increasing the hash size provides a significant speed boost on the OLPC
XO-1 laptop.

Add logic that dynamically selects a hash size based on the size of
the medium. A 64mb medium will result in a hash size of 128, and a 512mb
medium will result in a hash size of 1024.

Signed-off-by: Daniel Drake <dsd@laptop.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/build.c       |  2 +-
 fs/jffs2/fs.c          | 22 +++++++++++++++++++++-
 fs/jffs2/jffs2_fs_sb.h |  1 +
 fs/jffs2/nodelist.c    |  8 ++++----
 fs/jffs2/nodelist.h    |  3 ++-
 5 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a906f538d11c..85c6be2db02f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
 static inline struct jffs2_inode_cache *
 first_inode_chain(int *i, struct jffs2_sb_info *c)
 {
-	for (; *i < INOCACHE_HASHSIZE; (*i)++) {
+	for (; *i < c->inocache_hashsize; (*i)++) {
 		if (c->inocache_list[*i])
 			return c->inocache_list[*i];
 	}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..2701b372da78 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -478,6 +478,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	return inode;
 }
 
+static int calculate_inocache_hashsize(uint32_t flash_size)
+{
+	/*
+	 * Pick a inocache hash size based on the size of the medium.
+	 * Count how many megabytes we're dealing with, apply a hashsize twice
+	 * that size, but rounding down to the usual big powers of 2. And keep
+	 * to sensible bounds.
+	 */
+
+	int size_mb = flash_size / 1024 / 1024;
+	int hashsize = (size_mb * 2) & ~0x3f;
+
+	if (hashsize < INOCACHE_HASHSIZE_MIN)
+		return INOCACHE_HASHSIZE_MIN;
+	if (hashsize > INOCACHE_HASHSIZE_MAX)
+		return INOCACHE_HASHSIZE_MAX;
+
+	return hashsize;
+}
 
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 {
@@ -524,7 +543,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		return ret;
 
-	c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
+	c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
+	c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
 	if (!c->inocache_list) {
 		ret = -ENOMEM;
 		goto out_wbuf;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 6784bc89add1..f864005de64c 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
 	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
 
 	wait_queue_head_t inocache_wq;
+	int inocache_hashsize;
 	struct jffs2_inode_cache **inocache_list;
 	spinlock_t inocache_lock;
 
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd138469..5e03233c2363 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
 {
 	struct jffs2_inode_cache *ret;
 
-	ret = c->inocache_list[ino % INOCACHE_HASHSIZE];
+	ret = c->inocache_list[ino % c->inocache_hashsize];
 	while (ret && ret->ino < ino) {
 		ret = ret->next;
 	}
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
 
 	dbg_inocache("add %p (ino #%u)\n", new, new->ino);
 
-	prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE];
+	prev = &c->inocache_list[new->ino % c->inocache_hashsize];
 
 	while ((*prev) && (*prev)->ino < new->ino) {
 		prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
 	dbg_inocache("del %p (ino #%u)\n", old, old->ino);
 	spin_lock(&c->inocache_lock);
 
-	prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE];
+	prev = &c->inocache_list[old->ino % c->inocache_hashsize];
 
 	while ((*prev) && (*prev)->ino < old->ino) {
 		prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
 	int i;
 	struct jffs2_inode_cache *this, *next;
 
-	for (i=0; i<INOCACHE_HASHSIZE; i++) {
+	for (i=0; i < c->inocache_hashsize; i++) {
 		this = c->inocache_list[i];
 		while (this) {
 			next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 523a91691052..5a53d9bdb2b5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -199,7 +199,8 @@ struct jffs2_inode_cache {
 #define RAWNODE_CLASS_XATTR_DATUM	1
 #define RAWNODE_CLASS_XATTR_REF		2
 
-#define INOCACHE_HASHSIZE 128
+#define INOCACHE_HASHSIZE_MIN 128
+#define INOCACHE_HASHSIZE_MAX 1024
 
 #define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
 
-- 
cgit v1.2.3


From 81cfc9f1f4ad8d335367bb393bd042cc45b00047 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Thu, 7 Oct 2010 18:01:44 +0200
Subject: jffs2: Fix serious write stall due to erase

Drop the alloc_sem before erasing flash in
jffs2_garbage_collect_pass().
Otherwise writes are put on hold until the erase
has finised.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/gc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 846a79452497..31dce611337c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	if (!list_empty(&c->erase_complete_list) ||
 	    !list_empty(&c->erase_pending_list)) {
 		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->alloc_sem);
 		D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
-		if (jffs2_erase_pending_blocks(c, 1)) {
-			mutex_unlock(&c->alloc_sem);
+		if (jffs2_erase_pending_blocks(c, 1))
 			return 0;
-		}
+
 		D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
 		spin_lock(&c->erase_completion_lock);
+		mutex_lock(&c->alloc_sem);
 	}
 
 	/* First, work out which block we're garbage-collecting */
-- 
cgit v1.2.3


From 41bdc602eca8738d6f3c71235744f72d888fe6b4 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Thu, 7 Oct 2010 19:09:34 +0200
Subject: jffs2: Reduce excessive scan of empty blocks

Scanning 1024 bytes to see if an EB is empty is a bit much.
Lower it to 256 bytes and make sure the while loop is
optimized.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/scan.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc36..b632dddcb482 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
 #include "summary.h"
 #include "debug.h"
 
-#define DEFAULT_EMPTY_SCAN_SIZE 1024
+#define DEFAULT_EMPTY_SCAN_SIZE 256
 
 #define noisy_printk(noise, args...) do { \
 	if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
 	struct jffs2_unknown_node crcnode;
-	uint32_t ofs, prevofs;
+	uint32_t ofs, prevofs, max_ofs;
 	uint32_t hdr_crc, buf_ofs, buf_len;
 	int err;
 	int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 
 	/* We temporarily use 'ofs' as a pointer into the buffer/jeb */
 	ofs = 0;
-
-	/* Scan only 4KiB of 0xFF before declaring it's empty */
-	while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
+	max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
+	/* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
+	while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
 		ofs += 4;
 
-	if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) {
+	if (ofs == max_ofs) {
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 		if (jffs2_cleanmarker_oob(c)) {
 			/* scan oob, take care of cleanmarker */
-- 
cgit v1.2.3


From 5f6dbc9e4afe4d1d39e85de3ac2720a2042ad191 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:06 -0400
Subject: cifs: convert cifsFileInfo->count to non-atomic counter

The count for cifsFileInfo is currently an atomic, but that just adds
complexity for little value. We generally need to hold cifs_file_list_lock
to traverse the lists anyway so we might as well make this counter
non-atomic and simply use the cifs_file_list_lock to protect it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 9 ++++++---
 fs/cifs/file.c     | 8 +++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 3365e77f6f24..6319db025e7b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -395,16 +395,19 @@ struct cifsFileInfo {
 	struct list_head llist; /* list of byte range locks we have. */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool oplock_break_cancelled:1;
-	atomic_t count;		/* reference count */
+	int count;		/* refcount -- protected by cifs_file_list_lock */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
 	struct work_struct oplock_break; /* work for oplock breaks */
 };
 
-/* Take a reference on the file private data */
+/*
+ * Take a reference on the file private data. Must be called with
+ * cifs_file_list_lock held.
+ */
 static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 {
-	atomic_inc(&cifs_file->count);
+	++cifs_file->count;
 }
 
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8c81e7b14d53..baf4b5067ff9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -232,6 +232,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	if (pCifsFile == NULL)
 		return pCifsFile;
 
+	pCifsFile->count = 1;
 	pCifsFile->netfid = fileHandle;
 	pCifsFile->pid = current->tgid;
 	pCifsFile->uid = current_fsuid();
@@ -242,7 +243,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->count, 1);
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
 	spin_lock(&cifs_file_list_lock);
@@ -267,7 +267,8 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 
 /*
  * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server.
+ * the filehandle out on the server. Must be called without holding
+ * cifs_file_list_lock.
  */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
@@ -276,7 +277,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
-	if (!atomic_dec_and_test(&cifs_file->count)) {
+	if (--cifs_file->count > 0) {
 		spin_unlock(&cifs_file_list_lock);
 		return;
 	}
@@ -2322,6 +2323,7 @@ void cifs_oplock_break(struct work_struct *work)
 	cifs_oplock_break_put(cfile);
 }
 
+/* must be called while holding cifs_file_list_lock */
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
 	cifs_sb_active(cfile->dentry->d_sb);
-- 
cgit v1.2.3


From d3f1322af8f866be54ea79a4002fc30c98290411 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 15 Oct 2010 15:34:07 -0400
Subject: cifs: wait for writeback to complete in cifs_flush

The f_op->flush operation is the last chance to return a writeback
related error when closing a file. Ensure that we don't miss reporting
any errors by waiting for writeback to complete in cifs_flush before
proceeding.

There's no reason to do this when the file isn't open for write
however.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: David Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index baf4b5067ff9..128e07b85d6c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1688,20 +1688,13 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	/* Rather than do the steps manually:
-	   lock the inode for writing
-	   loop through pages looking for write behind data (dirty pages)
-	   coalesce into contiguous 16K (or smaller) chunks to write to server
-	   send to server (prefer in parallel)
-	   deal with writebehind errors
-	   unlock inode for writing
-	   filemapfdatawrite appears easier for the time being */
-
-	rc = filemap_fdatawrite(inode->i_mapping);
-	/* reset wb rc if we were able to write out dirty pages */
-	if (!rc) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
+	if (file->f_mode & FMODE_WRITE) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		/* reset wb rc if we were able to write out dirty pages */
+		if (!rc) {
+			rc = CIFS_I(inode)->write_behind_rc;
+			CIFS_I(inode)->write_behind_rc = 0;
+		}
 	}
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
-- 
cgit v1.2.3


From 6c0f6218ba04a5d6e61d0c55b68e6c4ef0736531 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 23 Oct 2010 16:51:40 +0000
Subject: [CIFS] Fix checkpatch warnings and bump cifs version number

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h   | 2 +-
 fs/cifs/cifsglob.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f35795a16b42..897b2b2b28b5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -112,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.67"
+#define CIFS_VERSION   "1.68"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6319db025e7b..7128c291cec8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -74,7 +74,7 @@
  * CIFS vfs client Status information (based on what we know.)
  */
 
- /* associated with each tcp and smb session */
+/* associated with each tcp and smb session */
 enum statusEnum {
 	CifsNew = 0,
 	CifsGood,
@@ -395,7 +395,7 @@ struct cifsFileInfo {
 	struct list_head llist; /* list of byte range locks we have. */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool oplock_break_cancelled:1;
-	int count;		/* refcount -- protected by cifs_file_list_lock */
+	int count;		/* refcount protected by cifs_file_list_lock */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
 	struct work_struct oplock_break; /* work for oplock breaks */
-- 
cgit v1.2.3


From eb4b756b1e60b66e54932619088b645c712414a3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 22 Oct 2010 14:52:29 -0400
Subject: cifs: eliminate cifsInodeInfo->write_behind_rc (try #6)

write_behind_rc is redundant and just adds complexity to the code. What
we really want to do instead is to use mapping_set_error to reset the
flags on the mapping when we find a writeback error and can't report it
to userspace yet.

For cifs_flush and cifs_fsync, we shouldn't reset the flags since errors
returned there do get reported to userspace.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |  1 -
 fs/cifs/cifsglob.h |  1 -
 fs/cifs/file.c     | 37 ++++++++++---------------------------
 fs/cifs/inode.c    | 15 +++++----------
 4 files changed, 15 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 34371637f210..e8b9523c9f39 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -318,7 +318,6 @@ cifs_alloc_inode(struct super_block *sb)
 		return NULL;
 	cifs_inode->cifsAttrs = 0x20;	/* default */
 	cifs_inode->time = 0;
-	cifs_inode->write_behind_rc = 0;
 	/* Until the file is open and we have gotten oplock
 	info back from the server, can not assume caching of
 	file data or metadata */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7128c291cec8..ef5709b250e6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -420,7 +420,6 @@ struct cifsInodeInfo {
 	struct list_head lockList;
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
-	int write_behind_rc;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
 	unsigned long time;	/* jiffies of last update/check of inode */
 	bool clientCanCacheRead:1;	/* read oplock */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 128e07b85d6c..ad47820655d9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -131,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
 			/* BB no need to lock inode until after invalidate
 			since namei code should already have it locked? */
 			rc = filemap_write_and_wait(inode->i_mapping);
-			if (rc != 0)
-				pCifsInode->write_behind_rc = rc;
+			mapping_set_error(inode->i_mapping, rc);
 		}
 		cFYI(1, "invalidating remote inode since open detected it "
 			 "changed");
@@ -606,8 +605,7 @@ reopen_success:
 
 	if (can_flush) {
 		rc = filemap_write_and_wait(inode->i_mapping);
-		if (rc != 0)
-			CIFS_I(inode)->write_behind_rc = rc;
+		mapping_set_error(inode->i_mapping, rc);
 
 		pCifsInode->clientCanCacheAll = false;
 		pCifsInode->clientCanCacheRead = false;
@@ -1489,12 +1487,7 @@ retry:
 			if (rc || bytes_written < bytes_to_write) {
 				cERROR(1, "Write2 ret %d, wrote %d",
 					  rc, bytes_written);
-				/* BB what if continued retry is
-				   requested via mount flags? */
-				if (rc == -ENOSPC)
-					set_bit(AS_ENOSPC, &mapping->flags);
-				else
-					set_bit(AS_EIO, &mapping->flags);
+				mapping_set_error(mapping, rc);
 			} else {
 				cifs_stats_bytes_written(tcon, bytes_written);
 			}
@@ -1639,11 +1632,10 @@ int cifs_fsync(struct file *file, int datasync)
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
+		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
 		tcon = tlink_tcon(smbfile->tlink);
-		if (!rc && tcon && smbfile &&
-		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
 			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
 
@@ -1688,14 +1680,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	if (file->f_mode & FMODE_WRITE) {
+	if (file->f_mode & FMODE_WRITE)
 		rc = filemap_write_and_wait(inode->i_mapping);
-		/* reset wb rc if we were able to write out dirty pages */
-		if (!rc) {
-			rc = CIFS_I(inode)->write_behind_rc;
-			CIFS_I(inode)->write_behind_rc = 0;
-		}
-	}
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
@@ -2274,7 +2260,7 @@ void cifs_oplock_break(struct work_struct *work)
 						  oplock_break);
 	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	int rc, waitrc = 0;
+	int rc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
 		if (cinode->clientCanCacheRead)
@@ -2283,13 +2269,10 @@ void cifs_oplock_break(struct work_struct *work)
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
-			waitrc = filemap_fdatawait(inode->i_mapping);
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
 			invalidate_remote_inode(inode);
 		}
-		if (!rc)
-			rc = waitrc;
-		if (rc)
-			cinode->write_behind_rc = rc;
 		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 94979309698a..39869c3c3efb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1682,8 +1682,7 @@ cifs_invalidate_mapping(struct inode *inode)
 	/* write back any cached data */
 	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = filemap_write_and_wait(inode->i_mapping);
-		if (rc)
-			cifs_i->write_behind_rc = rc;
+		mapping_set_error(inode->i_mapping, rc);
 	}
 	invalidate_remote_inode(inode);
 	cifs_fscache_reset_inode_cookie(inode);
@@ -1943,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc != 0) {
-		cifsInode->write_behind_rc = rc;
-		rc = 0;
-	}
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
 
 	if (attrs->ia_valid & ATTR_SIZE) {
 		rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2087,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc != 0) {
-		cifsInode->write_behind_rc = rc;
-		rc = 0;
-	}
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
 
 	if (attrs->ia_valid & ATTR_SIZE) {
 		rc = cifs_set_file_size(inode, attrs, xid, full_path);
-- 
cgit v1.2.3


From 6573e9b73e19c0f6b9dfa2b399267ea0f42d6c6b Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Mon, 18 Oct 2010 23:52:18 +0530
Subject: cifs: update comments - [s/GlobalSMBSesLock/cifs_file_list_lock/g]

GlobalSMBSesLock is now cifs_file_list_lock. Update comments to reflect this.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 2 +-
 fs/cifs/file.c     | 2 +-
 fs/cifs/misc.c     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ef5709b250e6..539475a11c45 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -670,7 +670,7 @@ require use of the stronger protocol */
  *  GlobalMid_Lock protects:
  *	list operations on pending_mid_q and oplockQ
  *      updates to XID counters, multiplex id  and SMB sequence numbers
- *  GlobalSMBSesLock protects:
+ *  cifs_file_list_lock protects:
  *	list operations on tcp and SMB session lists and tCon lists
  *  f_owner.lock protects certain per file struct operations
  *  mapping->page_lock protects certain per page operations
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ad47820655d9..d7c212a38386 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2291,7 +2291,7 @@ void cifs_oplock_break(struct work_struct *work)
 	/*
 	 * We might have kicked in before is_valid_oplock_break()
 	 * finished grabbing reference for us.  Make sure it's done by
-	 * waiting for GlobalSMSSeslock.
+	 * waiting for cifs_file_list_lock.
 	 */
 	spin_lock(&cifs_file_list_lock);
 	spin_unlock(&cifs_file_list_lock);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1c681f6a6803..c4e296fe3518 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -577,7 +577,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				 * cifs_oplock_break_put() can't be called
 				 * from here.  Get reference after queueing
 				 * succeeded.  cifs_oplock_break() will
-				 * synchronize using GlobalSMSSeslock.
+				 * synchronize using cifs_file_list_lock.
 				 */
 				if (queue_work(system_nrt_wq,
 					       &netfile->oplock_break))
-- 
cgit v1.2.3


From 04aadf36de625647c72ec24c7e901896dd2a99e6 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sun, 17 Oct 2010 21:56:15 +0200
Subject: jffs2: use kmemdup

Convert a sequence of kmalloc and memcpy to use kmemdup.

The semantic patch that performs this transformation is:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression a,flag,len;
expression arg,e1,e2;
statement S;
@@

  a =
-  \(kmalloc\|kzalloc\)(len,flag)
+  kmemdup(arg,len,flag)
  <... when != a
  if (a == NULL || ...) S
  ...>
- memcpy(a,arg,len+1);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/dir.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..dd745c361d7c 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	}
 
 	/* We use f->target field to store the target path. */
-	f->target = kmalloc(targetlen + 1, GFP_KERNEL);
+	f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
 	if (!f->target) {
 		printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
 		mutex_unlock(&f->sem);
@@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		goto fail;
 	}
 
-	memcpy(f->target, target, targetlen + 1);
 	D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
 
 	/* No data here. Only a metadata node, which will be
-- 
cgit v1.2.3


From a663bdd8c5d18d287f7468470816c9e0e66343c1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Oct 2010 17:17:31 -0400
Subject: nfsd4: fix connection allocation in sequence()

We're doing an allocation under a spinlock, and ignoring the
possibility of allocation failure.

A better fix wouldn't require an unnecessary allocation in the common
case, but we'll leave that for later.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ce0412fd23eb..d4aa1b59d84b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1628,33 +1628,25 @@ out:
 	return status;
 }
 
-static struct nfsd4_conn *__nfsd4_find_conn(struct svc_rqst *r, struct nfsd4_session *s)
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
 {
 	struct nfsd4_conn *c;
 
 	list_for_each_entry(c, &s->se_conns, cn_persession) {
-		if (c->cn_xprt == r->rq_xprt) {
+		if (c->cn_xprt == xpt) {
 			return c;
 		}
 	}
 	return NULL;
 }
 
-static void nfsd4_sequence_check_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
 {
 	struct nfs4_client *clp = ses->se_client;
-	struct nfsd4_conn *c, *new = NULL;
-
-	spin_lock(&clp->cl_lock);
-	c = __nfsd4_find_conn(rqstp, ses);
-	spin_unlock(&clp->cl_lock);
-	if (c)
-		return;
-
-	new = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+	struct nfsd4_conn *c;
 
 	spin_lock(&clp->cl_lock);
-	c = __nfsd4_find_conn(rqstp, ses);
+	c = __nfsd4_find_conn(new->cn_xprt, ses);
 	if (c) {
 		spin_unlock(&clp->cl_lock);
 		free_conn(new);
@@ -1674,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfsd4_session *session;
 	struct nfsd4_slot *slot;
+	struct nfsd4_conn *conn;
 	int status;
 
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
+	/*
+	 * Will be either used or freed by nfsd4_sequence_check_conn
+	 * below.
+	 */
+	conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+	if (!conn)
+		return nfserr_jukebox;
+
 	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1710,7 +1711,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status)
 		goto out;
 
-	nfsd4_sequence_check_conn(rqstp, session);
+	nfsd4_sequence_check_conn(conn, session);
+	conn = NULL;
 
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
@@ -1726,6 +1728,7 @@ out:
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&session->se_client->cl_refcount);
 	}
+	kfree(conn);
 	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
-- 
cgit v1.2.3


From b5ce1d83a62fc109d8e815b1fc71dcdb0d26bc49 Mon Sep 17 00:00:00 2001
From: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Date: Mon, 25 Oct 2010 02:03:44 -0400
Subject: Coda: add spin lock to protect accesses to struct coda_inode_info.

We mostly need it to protect cached user permissions. The c_flags field
is advisory, reading the wrong value is harmless and in the worst case
we hit a slow path where we have to make an extra upcall to the
userspace cache manager when revalidating a dentry or inode.

Signed-off-by: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/cache.c            | 17 ++++++++++++-----
 fs/coda/cnode.c            | 19 ++++++++++++++-----
 fs/coda/dir.c              |  6 ++++++
 fs/coda/file.c             | 12 ++++++++++--
 fs/coda/inode.c            |  2 ++
 include/linux/coda_fs_i.h  | 13 +++++++++----
 include/linux/coda_linux.h |  6 +++++-
 7 files changed, 58 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..9060f08e70cf 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
 {
 	struct coda_inode_info *cii = ITOC(inode);
 
+	spin_lock(&cii->c_lock);
 	cii->c_cached_epoch = atomic_read(&permission_epoch);
 	if (cii->c_uid != current_fsuid()) {
 		cii->c_uid = current_fsuid();
                 cii->c_cached_perm = mask;
         } else
                 cii->c_cached_perm |= mask;
+	spin_unlock(&cii->c_lock);
 }
 
 /* remove cached acl from an inode */
 void coda_cache_clear_inode(struct inode *inode)
 {
 	struct coda_inode_info *cii = ITOC(inode);
+	spin_lock(&cii->c_lock);
 	cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
+	spin_unlock(&cii->c_lock);
 }
 
 /* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
 int coda_cache_check(struct inode *inode, int mask)
 {
 	struct coda_inode_info *cii = ITOC(inode);
-        int hit;
+	int hit;
 	
-        hit = (mask & cii->c_cached_perm) == mask &&
-		cii->c_uid == current_fsuid() &&
-		cii->c_cached_epoch == atomic_read(&permission_epoch);
+	spin_lock(&cii->c_lock);
+	hit = (mask & cii->c_cached_perm) == mask &&
+	    cii->c_uid == current_fsuid() &&
+	    cii->c_cached_epoch == atomic_read(&permission_epoch);
+	spin_unlock(&cii->c_lock);
 
-        return hit;
+	return hit;
 }
 
 
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..602240569c89 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
 static int coda_test_inode(struct inode *inode, void *data)
 {
 	struct CodaFid *fid = (struct CodaFid *)data;
-	return coda_fideq(&(ITOC(inode)->c_fid), fid);
+	struct coda_inode_info *cii = ITOC(inode);
+	return coda_fideq(&cii->c_fid, fid);
 }
 
 static int coda_set_inode(struct inode *inode, void *data)
 {
 	struct CodaFid *fid = (struct CodaFid *)data;
-	ITOC(inode)->c_fid = *fid;
+	struct coda_inode_info *cii = ITOC(inode);
+	cii->c_fid = *fid;
 	return 0;
 }
 
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
 		cii = ITOC(inode);
 		/* we still need to set i_ino for things like stat(2) */
 		inode->i_ino = hash;
+		/* inode is locked and unique, no need to grab cii->c_lock */
 		cii->c_mapcount = 0;
 		unlock_new_inode(inode);
 	}
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
 }
 
 
+/* Although we treat Coda file identifiers as immutable, there is one
+ * special case for files created during a disconnection where they may
+ * not be globally unique. When an identifier collision is detected we
+ * first try to flush the cached inode from the kernel and finally
+ * resort to renaming/rehashing in-place. Userspace remembers both old
+ * and new values of the identifier to handle any in-flight upcalls.
+ * The real solution is to use globally unique UUIDs as identifiers, but
+ * retrofitting the existing userspace code for this is non-trivial. */
 void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 
 		      struct CodaFid *newfid)
 {
-	struct coda_inode_info *cii;
+	struct coda_inode_info *cii = ITOC(inode);
 	unsigned long hash = coda_f2i(newfid);
 	
-	cii = ITOC(inode);
-
 	BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
 
 	/* replace fid and rehash inode */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..69fbbea75f1b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
 
@@ -617,7 +618,9 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 		goto out;
 
 	/* clear the flags. */
+	spin_lock(&cii->c_lock);
 	cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+	spin_unlock(&cii->c_lock);
 
 bad:
 	unlock_kernel();
@@ -691,7 +694,10 @@ int coda_revalidate_inode(struct dentry *dentry)
 			goto return_bad;
 		
 		coda_flag_inode_children(inode, C_FLUSH);
+
+		spin_lock(&cii->c_lock);
 		cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+		spin_unlock(&cii->c_lock);
 	}
 
 ok:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..c4e395781d41 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -16,6 +16,7 @@
 #include <linux/cred.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -109,19 +110,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 
 	coda_inode = coda_file->f_path.dentry->d_inode;
 	host_inode = host_file->f_path.dentry->d_inode;
+
+	cii = ITOC(coda_inode);
+	spin_lock(&cii->c_lock);
 	coda_file->f_mapping = host_file->f_mapping;
 	if (coda_inode->i_mapping == &coda_inode->i_data)
 		coda_inode->i_mapping = host_inode->i_mapping;
 
 	/* only allow additional mmaps as long as userspace isn't changing
 	 * the container file on us! */
-	else if (coda_inode->i_mapping != host_inode->i_mapping)
+	else if (coda_inode->i_mapping != host_inode->i_mapping) {
+		spin_unlock(&cii->c_lock);
 		return -EBUSY;
+	}
 
 	/* keep track of how often the coda_inode/host_file has been mmapped */
-	cii = ITOC(coda_inode);
 	cii->c_mapcount++;
 	cfi->cfi_mapcount++;
+	spin_unlock(&cii->c_lock);
 
 	return host_file->f_op->mmap(host_file, vma);
 }
@@ -185,11 +191,13 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	cii = ITOC(coda_inode);
 
 	/* did we mmap this file? */
+	spin_lock(&cii->c_lock);
 	if (coda_inode->i_mapping == &host_inode->i_data) {
 		cii->c_mapcount -= cfi->cfi_mapcount;
 		if (!cii->c_mapcount)
 			coda_inode->i_mapping = &coda_inode->i_data;
 	}
+	spin_unlock(&cii->c_lock);
 
 	fput(cfi->cfi_container);
 	kfree(coda_file->private_data);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index bfe8179b1295..0553f3bd7b1b 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -16,6 +16,7 @@
 #include <linux/errno.h>
 #include <linux/unistd.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
 	ei->c_flags = 0;
 	ei->c_uid = 0;
 	ei->c_cached_perm = 0;
+	spin_lock_init(&ei->c_lock);
 	return &ei->vfs_inode;
 }
 
diff --git a/include/linux/coda_fs_i.h b/include/linux/coda_fs_i.h
index b3ef0c461578..e35071b1de0e 100644
--- a/include/linux/coda_fs_i.h
+++ b/include/linux/coda_fs_i.h
@@ -10,19 +10,24 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/spinlock.h>
 #include <linux/coda.h>
 
 /*
  * coda fs inode data
+ * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
+ * c_cached_perm.
+ * vfs_inode is set only when the inode is created and never changes.
+ * c_fid is set when the inode is created and should be considered immutable.
  */
 struct coda_inode_info {
-        struct CodaFid	   c_fid;	/* Coda identifier */
-        u_short	           c_flags;     /* flags (see below) */
-	struct list_head   c_cilist;    /* list of all coda inodes */
+	struct CodaFid	   c_fid;	/* Coda identifier */
+	u_short	           c_flags;     /* flags (see below) */
 	unsigned int	   c_mapcount;  /* nr of times this inode is mapped */
 	unsigned int	   c_cached_epoch; /* epoch for cached permissions */
 	vuid_t		   c_uid;	/* fsuid for cached permissions */
-        unsigned int       c_cached_perm; /* cached access permissions */
+	unsigned int       c_cached_perm; /* cached access permissions */
+	spinlock_t	   c_lock;
 	struct inode	   vfs_inode;
 };
 
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index dcc228aa335a..2e914d0771b9 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -89,7 +89,11 @@ static __inline__ char *coda_i2s(struct inode *inode)
 /* this will not zap the inode away */
 static __inline__ void coda_flag_inode(struct inode *inode, int flag)
 {
-	ITOC(inode)->c_flags |= flag;
+	struct coda_inode_info *cii = ITOC(inode);
+
+	spin_lock(&cii->c_lock);
+	cii->c_flags |= flag;
+	spin_unlock(&cii->c_lock);
 }		
 
 #endif
-- 
cgit v1.2.3


From f7cc02b8715618e179242ba9cc10bdc5146ae565 Mon Sep 17 00:00:00 2001
From: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Date: Mon, 25 Oct 2010 02:03:45 -0400
Subject: Coda: push BKL regions into coda_upcall()

Now that shared inode state is locked using the cii->c_lock, the BKL is
only used to protect the upcall queues used to communicate with the
userspace cache manager. The remaining state is all local and we can
push the lock further down into coda_upcall().

Signed-off-by: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/dir.c              | 149 +++++++++++++--------------------------------
 fs/coda/file.c             |  19 +-----
 fs/coda/inode.c            |  46 ++++++--------
 fs/coda/pioctl.c           |  22 ++-----
 fs/coda/psdev.c            |  13 +---
 fs/coda/symlink.c          |   3 -
 fs/coda/upcall.c           |  32 +++++++---
 include/linux/coda_psdev.h |   2 +-
 8 files changed, 96 insertions(+), 190 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 69fbbea75f1b..96fbeab77f2f 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
@@ -117,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 		goto exit;
 	}
 
-	lock_kernel();
-
 	error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
 			     &type, &resfid);
 	if (!error)
 		error = coda_cnode_make(&inode, &resfid, dir->i_sb);
 
-	unlock_kernel();
-
 	if (error && error != -ENOENT)
 		return ERR_PTR(error);
 
@@ -141,28 +136,24 @@ exit:
 
 int coda_permission(struct inode *inode, int mask)
 {
-        int error = 0;
+	int error;
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
  
 	if (!mask)
-		return 0; 
+		return 0;
 
 	if ((mask & MAY_EXEC) && !execute_ok(inode))
 		return -EACCES;
 
-	lock_kernel();
-
 	if (coda_cache_check(inode, mask))
-		goto out; 
+		return 0;
 
-        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
+	error = venus_access(inode->i_sb, coda_i2f(inode), mask);
     
 	if (!error)
 		coda_cache_enter(inode, mask);
 
- out:
-	unlock_kernel();
 	return error;
 }
 
@@ -201,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 /* creation routines: create, mknod, mkdir, link, symlink */
 static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
 {
-        int error=0;
+	int error;
 	const char *name=de->d_name.name;
 	int length=de->d_name.len;
 	struct inode *inode;
 	struct CodaFid newfid;
 	struct coda_vattr attrs;
 
-	lock_kernel();
-
-	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		unlock_kernel();
+	if (coda_isroot(dir) && coda_iscontrol(name, length))
 		return -EPERM;
-	}
 
 	error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 
 				0, mode, &newfid, &attrs);
-
-        if ( error ) {
-		unlock_kernel();
-		d_drop(de);
-		return error;
-	}
+	if (error)
+		goto err_out;
 
 	inode = coda_iget(dir->i_sb, &newfid, &attrs);
-	if ( IS_ERR(inode) ) {
-		unlock_kernel();
-		d_drop(de);
-		return PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
 	}
 
 	/* invalidate the directory cnode's attributes */
 	coda_dir_update_mtime(dir);
-	unlock_kernel();
 	d_instantiate(de, inode);
 	return 0;
+err_out:
+	d_drop(de);
+	return error;
 }
 
 static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -247,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
 	int error;
 	struct CodaFid newfid;
 
-	lock_kernel();
-
-	if (coda_isroot(dir) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	attrs.va_mode = mode;
 	error = venus_mkdir(dir->i_sb, coda_i2f(dir), 
 			       name, len, &newfid, &attrs);
-        
-        if ( error ) {
-		unlock_kernel();
-		d_drop(de);
-		return error;
-        }
+	if (error)
+		goto err_out;
          
 	inode = coda_iget(dir->i_sb, &newfid, &attrs);
-	if ( IS_ERR(inode) ) {
-		unlock_kernel();
-		d_drop(de);
-		return PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
 	}
 
 	/* invalidate the directory cnode's attributes */
 	coda_dir_inc_nlink(dir);
 	coda_dir_update_mtime(dir);
-	unlock_kernel();
 	d_instantiate(de, inode);
 	return 0;
+err_out:
+	d_drop(de);
+	return error;
 }
 
 /* try to make de an entry in dir_inodde linked to source_de */ 
@@ -288,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	int len = de->d_name.len;
 	int error;
 
-	lock_kernel();
-
-	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	error = venus_link(dir_inode->i_sb, coda_i2f(inode),
 			   coda_i2f(dir_inode), (const char *)name, len);
-
 	if (error) {
 		d_drop(de);
-		goto out;
+		return error;
 	}
 
 	coda_dir_update_mtime(dir_inode);
 	atomic_inc(&inode->i_count);
 	d_instantiate(de, inode);
 	inc_nlink(inode);
-
-out:
-	unlock_kernel();
-	return(error);
+	return 0;
 }
 
 
 static int coda_symlink(struct inode *dir_inode, struct dentry *de,
 			const char *symname)
 {
-        const char *name = de->d_name.name;
+	const char *name = de->d_name.name;
 	int len = de->d_name.len;
 	int symlen;
-	int error = 0;
-
-	lock_kernel();
+	int error;
 
-	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	symlen = strlen(symname);
-	if ( symlen > CODA_MAXPATHLEN ) {
-		unlock_kernel();
+	if (symlen > CODA_MAXPATHLEN)
 		return -ENAMETOOLONG;
-	}
 
 	/*
 	 * This entry is now negative. Since we do not create
@@ -344,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
 			      symname, symlen);
 
 	/* mtime is no good anymore */
-	if ( !error )
+	if (!error)
 		coda_dir_update_mtime(dir_inode);
 
-	unlock_kernel();
 	return error;
 }
 
@@ -358,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
 	const char *name = de->d_name.name;
 	int len = de->d_name.len;
 
-	lock_kernel();
-
 	error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
-	if ( error ) {
-		unlock_kernel();
+	if (error)
 		return error;
-	}
 
 	coda_dir_update_mtime(dir);
 	drop_nlink(de->d_inode);
-	unlock_kernel();
 	return 0;
 }
 
@@ -378,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
-	lock_kernel();
-
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -390,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 		coda_dir_drop_nlink(dir);
 		coda_dir_update_mtime(dir);
 	}
-	unlock_kernel();
 	return error;
 }
 
@@ -404,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
-	lock_kernel();
-
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
-
-	if ( !error ) {
-		if ( new_dentry->d_inode ) {
-			if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
+	if (!error) {
+		if (new_dentry->d_inode) {
+			if (S_ISDIR(new_dentry->d_inode->i_mode)) {
 				coda_dir_drop_nlink(old_dir);
 				coda_dir_inc_nlink(new_dir);
 			}
@@ -424,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 			coda_flag_inode(new_dir, C_VATTR);
 		}
 	}
-	unlock_kernel();
-
 	return error;
 }
 
@@ -595,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 	struct inode *inode = de->d_inode;
 	struct coda_inode_info *cii;
 
-	if (!inode)
-		return 1;
-	lock_kernel();
-	if (coda_isroot(inode))
+	if (!inode || coda_isroot(inode))
 		goto out;
 	if (is_bad_inode(inode))
 		goto bad;
@@ -621,12 +567,9 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 	spin_lock(&cii->c_lock);
 	cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
 	spin_unlock(&cii->c_lock);
-
 bad:
-	unlock_kernel();
 	return 0;
 out:
-	unlock_kernel();
 	return 1;
 }
 
@@ -659,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
 int coda_revalidate_inode(struct dentry *dentry)
 {
 	struct coda_vattr attr;
-	int error = 0;
+	int error;
 	int old_mode;
 	ino_t old_ino;
 	struct inode *inode = dentry->d_inode;
 	struct coda_inode_info *cii = ITOC(inode);
 
-	lock_kernel();
-	if ( !cii->c_flags )
-		goto ok;
+	if (!cii->c_flags)
+		return 0;
 
 	if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
 		error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
-		if ( error )
-			goto return_bad;
+		if (error)
+			return -EIO;
 
 		/* this inode may be lost if:
 		   - it's ino changed 
@@ -691,7 +633,7 @@ int coda_revalidate_inode(struct dentry *dentry)
 		/* the following can happen when a local fid is replaced 
 		   with a global one, here we lose and declare the inode bad */
 		if (inode->i_ino != old_ino)
-			goto return_bad;
+			return -EIO;
 		
 		coda_flag_inode_children(inode, C_FLUSH);
 
@@ -699,12 +641,5 @@ int coda_revalidate_inode(struct dentry *dentry)
 		cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
 		spin_unlock(&cii->c_lock);
 	}
-
-ok:
-	unlock_kernel();
 	return 0;
-
-return_bad:
-	unlock_kernel();
-	return -EIO;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c4e395781d41..c8b50ba4366a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,6 @@
 #include <linux/stat.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
@@ -144,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	if (!cfi)
 		return -ENOMEM;
 
-	lock_kernel();
-
 	error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
 			   &host_file);
 	if (!host_file)
@@ -153,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 
 	if (error) {
 		kfree(cfi);
-		unlock_kernel();
 		return error;
 	}
 
@@ -165,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 
 	BUG_ON(coda_file->private_data != NULL);
 	coda_file->private_data = cfi;
-
-	unlock_kernel();
 	return 0;
 }
 
@@ -177,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	struct coda_file_info *cfi;
 	struct coda_inode_info *cii;
 	struct inode *host_inode;
-	int err = 0;
-
-	lock_kernel();
+	int err;
 
 	cfi = CODA_FTOC(coda_file);
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -203,8 +195,6 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	kfree(coda_file->private_data);
 	coda_file->private_data = NULL;
 
-	unlock_kernel();
-
 	/* VFS fput ignores the return value from file_operations->release, so
 	 * there is no use returning an error here */
 	return 0;
@@ -215,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
 	struct file *host_file;
 	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
 	struct coda_file_info *cfi;
-	int err = 0;
+	int err;
 
 	if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
 	      S_ISLNK(coda_inode->i_mode)))
@@ -226,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
 	host_file = cfi->cfi_container;
 
 	err = vfs_fsync(host_file, datasync);
-	if ( !err && !datasync ) {
-		lock_kernel();
+	if (!err && !datasync)
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-		unlock_kernel();
-	}
 
 	return err;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 0553f3bd7b1b..b7fa3e3d772f 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -150,8 +150,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	int error;
 	int idx;
 
-	lock_kernel();
-
 	idx = get_device_index((struct coda_mount_data *) data);
 
 	/* Ignore errors in data, for backward compatibility */
@@ -161,23 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
 
 	vc = &coda_comms[idx];
+	lock_kernel();
+
 	if (!vc->vc_inuse) {
 		printk("coda_read_super: No pseudo device\n");
-		unlock_kernel();
-		return -EINVAL;
+		error = -EINVAL;
+		goto unlock_out;
 	}
 
-        if ( vc->vc_sb ) {
+	if (vc->vc_sb) {
 		printk("coda_read_super: Device already mounted\n");
-		unlock_kernel();
-		return -EBUSY;
+		error = -EBUSY;
+		goto unlock_out;
 	}
 
 	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
 	if (error)
-		goto bdi_err;
+		goto unlock_out;
 
 	vc->vc_sb = sb;
+	unlock_kernel();
 
 	sb->s_fs_info = vc;
 	sb->s_flags |= MS_NOATIME;
@@ -206,21 +207,23 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootinode is %ld dev %s\n", 
 	       root->i_ino, root->i_sb->s_id);
 	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		error = -EINVAL;
 		goto error;
-	unlock_kernel();
+	}
 	return 0;
 
- error:
-	bdi_destroy(&vc->bdi);
- bdi_err:
+error:
 	if (root)
 		iput(root);
-	if (vc)
-		vc->vc_sb = NULL;
 
+	lock_kernel();
+	bdi_destroy(&vc->bdi);
+	vc->vc_sb = NULL;
+	sb->s_fs_info = NULL;
+unlock_out:
 	unlock_kernel();
-	return -EINVAL;
+	return error;
 }
 
 static void coda_put_super(struct super_block *sb)
@@ -253,8 +256,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	struct coda_vattr vattr;
 	int error;
 
-	lock_kernel();
-	
 	memset(&vattr, 0, sizeof(vattr)); 
 
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -264,13 +265,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	/* Venus is responsible for truncating the container-file!!! */
 	error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
 
-	if ( !error ) {
+	if (!error) {
 	        coda_vattr_to_iattr(inode, &vattr); 
 		coda_cache_clear_inode(inode);
 	}
-
-	unlock_kernel();
-
 	return error;
 }
 
@@ -284,12 +282,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
-	lock_kernel();
-
 	error = venus_statfs(dentry, buf);
 
-	unlock_kernel();
-
 	if (error) {
 		/* fake something like AFS does */
 		buf->f_blocks = 9000000;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 028a9a0f588b..2fd89b5c5c7b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
-#include <linux/smp_lock.h>
-
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -58,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	struct inode *target_inode = NULL;
 	struct coda_inode_info *cnp;
 
-	lock_kernel();
-
 	/* get the Pioctl data arguments from user space */
-	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
+		return -EINVAL;
 
 	/*
 	 * Look up the pathname. Note that the pathname is in
@@ -76,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 		error = user_lpath(data.path, &path);
 
 	if (error)
-		goto out;
-	else
-		target_inode = path.dentry->d_inode;
+		return error;
+
+	target_inode = path.dentry->d_inode;
 
 	/* return if it is not a Coda inode */
 	if (target_inode->i_sb != inode->i_sb) {
-		path_put(&path);
 		error = -EINVAL;
 		goto out;
 	}
@@ -91,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
-
-	path_put(&path);
-
 out:
-	unlock_kernel();
+	path_put(&path);
 	return error;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index fdc2f3ef7ecd..9a9248e632c6 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -108,16 +108,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	        return -EFAULT;
 
         if (DOWNCALL(hdr.opcode)) {
-		struct super_block *sb = NULL;
-                union outputArgs *dcbuf;
+		union outputArgs *dcbuf;
 		int size = sizeof(*dcbuf);
 
-		sb = vcp->vc_sb;
-		if ( !sb ) {
-                        count = nbytes;
-                        goto out;
-		}
-
 		if  ( nbytes < sizeof(struct coda_out_hdr) ) {
 		        printk("coda_downcall opc %d uniq %d, not enough!\n",
 			       hdr.opcode, hdr.unique);
@@ -137,9 +130,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		}
 
 		/* what downcall errors does Venus handle ? */
-		lock_kernel();
-		error = coda_downcall(hdr.opcode, dcbuf, sb);
-		unlock_kernel();
+		error = coda_downcall(vcp, hdr.opcode, dcbuf);
 
 		CODA_FREE(dcbuf, nbytes);
 		if (error) {
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..af78f007a2b0 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 	unsigned int len = PAGE_SIZE;
 	char *p = kmap(page);
 
-	lock_kernel();
 	cii = ITOC(inode);
 
 	error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
-	unlock_kernel();
 	if (error)
 		goto fail;
 	SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..4c258cb5266d 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -667,18 +668,23 @@ static int coda_upcall(struct venus_comm *vcp,
 {
 	union outputArgs *out;
 	union inputArgs *sig_inputArgs;
-	struct upc_req *req, *sig_req;
-	int error = 0;
+	struct upc_req *req = NULL, *sig_req;
+	int error;
+
+	lock_kernel();
 
 	if (!vcp->vc_inuse) {
 		printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
-		return -ENXIO;
+		error = -ENXIO;
+		goto exit;
 	}
 
 	/* Format the request message. */
 	req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
-	if (!req)
-		return -ENOMEM;
+	if (!req) {
+		error = -ENOMEM;
+		goto exit;
+	}
 
 	req->uc_data = (void *)buffer;
 	req->uc_flags = 0;
@@ -759,6 +765,7 @@ static int coda_upcall(struct venus_comm *vcp,
 
 exit:
 	kfree(req);
+	unlock_kernel();
 	return error;
 }
 
@@ -796,21 +803,24 @@ exit:
  *
  * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
 	struct inode *inode = NULL;
 	struct CodaFid *fid, *newfid;
+	struct super_block *sb;
 
 	/* Handle invalidation requests. */
-	if ( !sb || !sb->s_root)
-		return 0;
+	lock_kernel();
+	sb = vcp->vc_sb;
+	if (!sb || !sb->s_root)
+		goto unlock_out;
 
 	switch (opcode) {
 	case CODA_FLUSH:
 		coda_cache_clear_all(sb);
 		shrink_dcache_sb(sb);
 		if (sb->s_root->d_inode)
-		    coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
+			coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
 		break;
 
 	case CODA_PURGEUSER:
@@ -855,9 +865,11 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
 		break;
 	}
 
+unlock_out:
+	unlock_kernel();
+
 	if (inode)
 		iput(inode);
-
 	return 0;
 }
 
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 284b520934a0..1e60c5a41a5b 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -63,7 +63,7 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid,
 int venus_access(struct super_block *sb, struct CodaFid *fid, int mask);
 int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 		 unsigned int cmd, struct PioctlData *data);
-int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb);
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
 int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
-- 
cgit v1.2.3


From da47c19e5c746829042933c8f945a71e2b62d6fc Mon Sep 17 00:00:00 2001
From: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Date: Mon, 25 Oct 2010 02:03:46 -0400
Subject: Coda: replace BKL with mutex

Replace the BKL with a mutex to protect the venus_comm structure which
binds the mountpoint with the character device and holds the upcall
queues.

Signed-off-by: Yoshihisa Abe <yoshiabe@cs.cmu.edu>
Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/inode.c            | 19 +++++++------
 fs/coda/psdev.c            | 28 +++++++++++-------
 fs/coda/upcall.c           | 71 ++++++++++++++++++++++++++--------------------
 include/linux/coda_psdev.h |  2 ++
 4 files changed, 70 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b7fa3e3d772f..7993b96ca348 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
@@ -145,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *root = NULL;
-	struct venus_comm *vc = NULL;
+	struct venus_comm *vc;
 	struct CodaFid fid;
 	int error;
 	int idx;
@@ -159,7 +159,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
 
 	vc = &coda_comms[idx];
-	lock_kernel();
+	mutex_lock(&vc->vc_mutex);
 
 	if (!vc->vc_inuse) {
 		printk("coda_read_super: No pseudo device\n");
@@ -178,7 +178,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		goto unlock_out;
 
 	vc->vc_sb = sb;
-	unlock_kernel();
+	mutex_unlock(&vc->vc_mutex);
 
 	sb->s_fs_info = vc;
 	sb->s_flags |= MS_NOATIME;
@@ -217,20 +217,23 @@ error:
 	if (root)
 		iput(root);
 
-	lock_kernel();
+	mutex_lock(&vc->vc_mutex);
 	bdi_destroy(&vc->bdi);
 	vc->vc_sb = NULL;
 	sb->s_fs_info = NULL;
 unlock_out:
-	unlock_kernel();
+	mutex_unlock(&vc->vc_mutex);
 	return error;
 }
 
 static void coda_put_super(struct super_block *sb)
 {
-	bdi_destroy(&coda_vcp(sb)->bdi);
-	coda_vcp(sb)->vc_sb = NULL;
+	struct venus_comm *vcp = coda_vcp(sb);
+	mutex_lock(&vcp->vc_mutex);
+	bdi_destroy(&vcp->bdi);
+	vcp->vc_sb = NULL;
 	sb->s_fs_info = NULL;
+	mutex_unlock(&vcp->vc_mutex);
 
 	printk("Coda: Bye bye.\n");
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 9a9248e632c6..62647a8595e4 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/device.h>
 #include <asm/io.h>
 #include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	unsigned int mask = POLLOUT | POLLWRNORM;
 
 	poll_wait(file, &vcp->vc_waitq, wait);
+	mutex_lock(&vcp->vc_mutex);
 	if (!list_empty(&vcp->vc_pending))
                 mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&vcp->vc_mutex);
 
 	return mask;
 }
@@ -143,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	}
         
 	/* Look for the message on the processing queue. */
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 	list_for_each(lh, &vcp->vc_processing) {
 		tmp = list_entry(lh, struct upc_req , uc_chain);
 		if (tmp->uc_unique == hdr.unique) {
@@ -152,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 			break;
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 
 	if (!req) {
 		printk("psdev_write: msg (%d, %d) not found\n", 
@@ -207,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	if (nbytes == 0)
 		return 0;
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	add_wait_queue(&vcp->vc_waitq, &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -221,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 			retval = -ERESTARTSYS;
 			break;
 		}
+		mutex_unlock(&vcp->vc_mutex);
 		schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 
 	set_current_state(TASK_RUNNING);
@@ -254,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 	kfree(req);
 out:
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return (count ? count : retval);
 }
 
@@ -267,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 	if (idx < 0 || idx >= MAX_CODADEVS)
 		return -ENODEV;
 
-	lock_kernel();
-
 	err = -EBUSY;
 	vcp = &coda_comms[idx];
+	mutex_lock(&vcp->vc_mutex);
+
 	if (!vcp->vc_inuse) {
 		vcp->vc_inuse++;
 
@@ -284,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 		err = 0;
 	}
 
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return err;
 }
 
@@ -299,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		return -1;
 	}
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	/* Wakeup clients so they can return. */
 	list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -324,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 
 	file->private_data = NULL;
 	vcp->vc_inuse--;
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return 0;
 }
 
@@ -353,9 +357,11 @@ static int init_coda_psdev(void)
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
 	}		
-	for (i = 0; i < MAX_CODADEVS; i++)
+	for (i = 0; i < MAX_CODADEVS; i++) {
+		mutex_init(&(&coda_comms[i])->vc_mutex);
 		device_create(coda_psdev_class, NULL,
 			      MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+	}
 	coda_sysctl_init();
 	goto out;
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 4c258cb5266d..c3563cab9758 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,7 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -607,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old)
 				 (r)->uc_opcode != CODA_RELEASE) || \
 				(r)->uc_flags & CODA_REQ_READ))
 
-static inline void coda_waitfor_upcall(struct upc_req *req)
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+				       struct upc_req *req)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -640,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 			break;
 		}
 
+		mutex_unlock(&vcp->vc_mutex);
 		if (blocked)
 			schedule_timeout(HZ);
 		else
 			schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 	if (blocked)
 		coda_unblock_signals(&old);
@@ -671,7 +674,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	struct upc_req *req = NULL, *sig_req;
 	int error;
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	if (!vcp->vc_inuse) {
 		printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
@@ -711,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	 * ENODEV.  */
 
 	/* Go to sleep.  Wake up on signals only after the timeout. */
-	coda_waitfor_upcall(req);
+	coda_waitfor_upcall(vcp, req);
 
 	/* Op went through, interrupt or not... */
 	if (req->uc_flags & CODA_REQ_WRITE) {
@@ -765,7 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
 
 exit:
 	kfree(req);
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return error;
 }
 
@@ -806,11 +809,11 @@ exit:
 int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
 	struct inode *inode = NULL;
-	struct CodaFid *fid, *newfid;
+	struct CodaFid *fid = NULL, *newfid;
 	struct super_block *sb;
 
 	/* Handle invalidation requests. */
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 	sb = vcp->vc_sb;
 	if (!sb || !sb->s_root)
 		goto unlock_out;
@@ -829,47 +832,53 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 
 	case CODA_ZAPDIR:
 		fid = &out->coda_zapdir.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
-			coda_flag_inode(inode, C_VATTR);
-		}
 		break;
 
 	case CODA_ZAPFILE:
 		fid = &out->coda_zapfile.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_flag_inode(inode, C_VATTR);
 		break;
 
 	case CODA_PURGEFID:
 		fid = &out->coda_purgefid.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
-
-			/* catch the dentries later if some are still busy */
-			coda_flag_inode(inode, C_PURGE);
-			d_prune_aliases(inode);
-
-		}
 		break;
 
 	case CODA_REPLACE:
 		fid = &out->coda_replace.OldFid;
-		newfid = &out->coda_replace.NewFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_replace_fid(inode, fid, newfid);
 		break;
 	}
+	if (fid)
+		inode = coda_fid_to_inode(fid, sb);
 
 unlock_out:
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
+
+	if (!inode)
+		return 0;
+
+	switch (opcode) {
+	case CODA_ZAPDIR:
+		coda_flag_inode_children(inode, C_PURGE);
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_ZAPFILE:
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_PURGEFID:
+		coda_flag_inode_children(inode, C_PURGE);
 
-	if (inode)
-		iput(inode);
+		/* catch the dentries later if some are still busy */
+		coda_flag_inode(inode, C_PURGE);
+		d_prune_aliases(inode);
+		break;
+
+	case CODA_REPLACE:
+		newfid = &out->coda_replace.NewFid;
+		coda_replace_fid(inode, fid, newfid);
+		break;
+	}
+	iput(inode);
 	return 0;
 }
 
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 1e60c5a41a5b..72f2d2f0af91 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -8,6 +8,7 @@
 
 #ifdef __KERNEL__
 #include <linux/backing-dev.h>
+#include <linux/mutex.h>
 
 struct kstatfs;
 
@@ -20,6 +21,7 @@ struct venus_comm {
 	int                 vc_inuse;
 	struct super_block *vc_sb;
 	struct backing_dev_info bdi;
+	struct mutex	    vc_mutex;
 };
 
 
-- 
cgit v1.2.3


From 571f7f46bf367b29f574ca51a9ca1db5035602bb Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 21 Oct 2010 22:17:17 -0700
Subject: fs/exofs: typo fix of faild to failed

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/dir.c   |  4 ++--
 fs/exofs/inode.c | 14 +++++++-------
 fs/exofs/ios.c   | 10 +++++-----
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
 	err = exofs_write_begin(NULL, page->mapping, pos, len,
 				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err)
-		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
 			  err);
 
 	de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
 	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
 							&page, NULL);
 	if (err)
-		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
 			  err);
 	if (pde)
 		pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0ba9886da2ec..f5f8ce03c2f3 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -505,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
 	if (!pcol_copy) {
-		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
 		ret = -ENOMEM;
 		goto err;
 	}
@@ -521,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
 
 	ret = exofs_oi_write(oi, ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
+		EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
 		goto err;
 	}
 
@@ -622,7 +622,7 @@ try_again:
 		/* split the request, next loop will start again */
 		ret = write_exec(pcol);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("write_exec faild => %d", ret);
+			EXOFS_DBGMSG("write_exec failed => %d", ret);
 			goto fail;
 		}
 
@@ -713,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
 					 fsdata);
 		if (ret) {
-			EXOFS_DBGMSG("simple_write_begin faild\n");
+			EXOFS_DBGMSG("simple_write_begin failed\n");
 			goto out;
 		}
 
@@ -726,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		if (ret) {
 			/*SetPageError was done by _readpage. Is it ok?*/
 			unlock_page(page);
-			EXOFS_DBGMSG("__readpage_filler faild\n");
+			EXOFS_DBGMSG("__readpage_filler failed\n");
 		}
 	}
 out:
@@ -1090,7 +1090,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
-		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
 			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
@@ -1211,7 +1211,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
 	if (!args) {
-		EXOFS_DBGMSG("Faild kzalloc of args\n");
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
 		return -ENOMEM;
 	}
 
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 
 	ret = osd_finalize_request(or, 0, cred, NULL);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
 		goto out;
 	}
 
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
 	 */
 	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
-		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+		EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
 			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 
 		ret = osd_finalize_request(or, 0, ios->cred, NULL);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+			EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
 				     ret);
 			return ret;
 		}
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
-			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+			EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
 				     bio_size);
 			return -ENOMEM;
 		}
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
 					EXOFS_DBGMSG(
-					      "Faild to allocate BIO size=%u\n",
+					      "Failed to allocate BIO size=%u\n",
 					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
-- 
cgit v1.2.3


From fe2fd9ed5bf184f797412be8b86f4589d1b77bb8 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sat, 16 Oct 2010 19:14:01 +1100
Subject: exofs: Remove inode->i_count manipulation in exofs_new_inode

exofs_new_inode() was incrementing the inode->i_count and
decrementing it in create_done(), in a bad attempt to make sure
the inode will still be there when the asynchronous create_done()
finally arrives. This was very stupid because iput() was not called,
and if it was actually needed, it would leak the inode.

However all this is not needed, because at exofs_evict_inode()
we already wait for create_done() by waiting for the
object_created event. Therefore remove the superfluous ref counting
and just Thicken the comment at exofs_evict_inode() a bit.

While at it change places that open coded wait_obj_created()
to call the already available wrapper.

CC: Dave Chinner <dchinner@redhat.com>
CC: Christoph Hellwig <hch@lst.de>
CC: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/inode.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f5f8ce03c2f3..42685424817b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1067,8 +1067,10 @@ bad_inode:
 int __exofs_wait_obj_created(struct exofs_i_info *oi)
 {
 	if (!obj_created(oi)) {
+		EXOFS_DBGMSG("!obj_created\n");
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
+		EXOFS_DBGMSG("wait_event done\n");
 	}
 	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
@@ -1102,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
 
 	set_obj_created(oi);
 
-	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
 }
 
@@ -1153,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	ios->obj.id = exofs_oi_objno(oi);
 	exofs_make_credential(oi->i_cred, &ios->obj);
 
-	/* increment the refcount so that the inode will still be around when we
-	 * reach the callback
-	 */
-	atomic_inc(&inode->i_count);
-
 	ios->done = create_done;
 	ios->private = inode;
 	ios->cred = oi->i_cred;
 	ret = exofs_sbi_create(ios);
 	if (ret) {
-		atomic_dec(&inode->i_count);
 		exofs_put_io_state(ios);
 		return ERR_PTR(ret);
 	}
@@ -1253,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	ios->out_attr_len = 1;
 	ios->out_attr = &attr;
 
-	if (!obj_created(oi)) {
-		EXOFS_DBGMSG("!obj_created\n");
-		BUG_ON(!obj_2bcreated(oi));
-		wait_event(oi->i_wq, obj_created(oi));
-		EXOFS_DBGMSG("wait_event done\n");
-	}
+	wait_obj_created(oi);
 
 	if (!do_sync) {
 		args->sbi = sbi;
@@ -1321,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode)
 	inode->i_size = 0;
 	end_writeback(inode);
 
-	/* if we are deleting an obj that hasn't been created yet, wait */
-	if (!obj_created(oi)) {
-		BUG_ON(!obj_2bcreated(oi));
-		wait_event(oi->i_wq, obj_created(oi));
-		/* ignore the error attempt a remove anyway */
-	}
+	/* if we are deleting an obj that hasn't been created yet, wait.
+	 * This also makes sure that create_done cannot be called with an
+	 * already evicted inode.
+	 */
+	wait_obj_created(oi);
+	/* ignore the error, attempt a remove anyway */
 
 	/* Now Remove the OSD objects */
 	ret = exofs_get_io_state(&sbi->layout, &ios);
-- 
cgit v1.2.3


From e50fb58b5b3548e578d3b74ff15aeb7d9a496839 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 25 Oct 2010 20:39:07 +0200
Subject: hfsplus: fix double lock typo in ioctl

This was supposed to be a mutex_unlock() instead of a mutex_lock().

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5b4667e08ef7..40a85a3ded6e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -92,7 +92,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	mark_inode_dirty(inode);
 
 out_unlock_inode:
-	mutex_lock(&inode->i_mutex);
+	mutex_unlock(&inode->i_mutex);
 out_drop_write:
 	mnt_drop_write(file->f_path.mnt);
 out:
-- 
cgit v1.2.3


From 81fca444001e5a41ab80ce8cf9a5734c00ec6546 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 10:47:47 +0200
Subject: fs: move permission check back into __lookup_hash

The caller that didn't need it is gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..f1ef97dbc6c4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
+	struct inode *inode = base->d_inode;
 	struct dentry *dentry;
-	struct inode *inode;
 	int err;
 
-	inode = base->d_inode;
+	err = exec_permission(inode);
+	if (err)
+		return ERR_PTR(err);
 
 	/*
 	 * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
  */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-	int err;
-
-	err = exec_permission(nd->path.dentry->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (err)
 		return ERR_PTR(err);
 
-	err = exec_permission(base->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&this, base, NULL);
 }
 
-- 
cgit v1.2.3


From c37650161a53c01ddd88587675f9a4adc909a73e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 10:48:20 +0200
Subject: fs: add sync_inode_metadata

Add a new helper to write out the inode using the writeback code,
that is including the correct dirty bit and list manipulation.  A few
of filesystems already opencode this, and a lot of others should be
using it instead of using write_inode_now which also writes out the
data.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/pohmelfs/inode.c |  6 +-----
 fs/exofs/file.c                  |  6 +-----
 fs/ext2/dir.c                    |  2 +-
 fs/ext2/ext2.h                   |  1 -
 fs/ext2/inode.c                  | 11 +----------
 fs/ext2/xattr.c                  |  2 +-
 fs/fs-writeback.c                | 20 ++++++++++++++++++++
 fs/libfs.c                       |  6 +-----
 fs/nfsd/vfs.c                    | 16 +++-------------
 include/linux/fs.h               |  1 +
 10 files changed, 30 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 97dae297ca3c..c62d30017c07 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -882,12 +882,8 @@ static struct inode *pohmelfs_alloc_inode(struct super_block *sb)
 static int pohmelfs_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0,	/* sys_fsync did this */
-	};
 
-	return sync_inode(inode, &wbc);
+	return sync_inode_metadata(inode, 1);
 }
 
 ssize_t pohmelfs_write(struct file *file, const char __user *buf,
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 {
 	int ret;
 	struct inode *inode = filp->f_mapping->host;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0, /* metadata-only; caller takes care of data */
-	};
 	struct super_block *sb;
 
 	if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		return 0;
 
-	ret = sync_inode(inode, &wbc);
+	ret = sync_inode_metadata(inode, 1);
 
 	/* This is a good place to write the sb */
 	/* TODO: Sechedule an sb-sync on create */
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 	if (IS_DIRSYNC(dir)) {
 		err = write_one_page(page, 1);
 		if (!err)
-			err = ext2_sync_inode(dir);
+			err = sync_inode_metadata(dir, 1);
 	} else {
 		unlock_page(page);
 	}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
-extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 533699c16040..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
-		ext2_sync_inode (inode);
+		sync_inode_metadata(inode, 1);
 	} else {
 		mark_inode_dirty(inode);
 	}
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
-int ext2_sync_inode(struct inode *inode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0,	/* sys_fsync did this */
-	};
-	return sync_inode(inode, &wbc);
-}
-
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 	EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	if (IS_SYNC(inode)) {
-		error = ext2_sync_inode (inode);
+		error = sync_inode_metadata(inode, 1);
 		/* In case sync failed due to ENOSPC the inode was actually
 		 * written (only some dirty data were not) so we just proceed
 		 * as if nothing happened and cleanup the unused block */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..29e3f409bbd0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1198,3 +1198,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust it's dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+	struct writeback_control wbc = {
+		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.nr_to_write = 0, /* metadata-only */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa0387d6e..2dbf4877d7ef 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
  */
 int generic_file_fsync(struct file *file, int datasync)
 {
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0, /* metadata-only; caller takes care of data */
-	};
 	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;
@@ -906,7 +902,7 @@ int generic_file_fsync(struct file *file, int datasync)
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		return ret;
 
-	err = sync_inode(inode, &wbc);
+	err = sync_inode_metadata(inode, 1);
 	if (ret == 0)
 		ret = err;
 	return ret;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
 	struct inode *inode = fhp->fh_dentry->d_inode;
 	const struct export_operations *export_ops = inode->i_sb->s_export_op;
-	int error = 0;
 
 	if (!EX_ISSYNC(fhp->fh_export))
 		return 0;
 
-	if (export_ops->commit_metadata) {
-		error = export_ops->commit_metadata(inode);
-	} else {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* metadata only */
-		};
-
-		error = sync_inode(inode, &wbc);
-	}
-
-	return error;
+	if (export_ops->commit_metadata)
+		return export_ops->commit_metadata(inode);
+	return sync_inode_metadata(inode, 1);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f34ff6e5558..0b03f490572f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1734,6 +1734,7 @@ static inline void file_accessed(struct file *file)
 }
 
 int sync_inode(struct inode *inode, struct writeback_control *wbc);
+int sync_inode_metadata(struct inode *inode, int wait);
 
 struct file_system_type {
 	const char *name;
-- 
cgit v1.2.3


From 56b0dacfa2b8416815a2f2a5f4f51e46be4cf14c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 10:48:55 +0200
Subject: fs: mark destroy_inode static

Hugetlbfs used to need it, but after the destroy_inode and evict_inode
changes it's not required anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 2 +-
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..3368abd64bb5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -235,7 +235,7 @@ void __destroy_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(__destroy_inode);
 
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
 	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0b03f490572f..0a5d83633884 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2187,7 +2187,6 @@ extern void unlock_new_inode(struct inode *);
 extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
 extern void end_writeback(struct inode *);
-extern void destroy_inode(struct inode *);
 extern void __destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
 extern int should_remove_suid(struct dentry *);
-- 
cgit v1.2.3


From ebdec241d509cf69f6ebf1ecdc036359d3dbe154 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 10:47:23 +0200
Subject: fs: kill block_prepare_write

__block_write_begin and block_prepare_write are identical except for slightly
different calling conventions.  Convert all callers to the __block_write_begin
calling conventions and drop block_prepare_write.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                  | 17 +++++------------
 fs/ext3/inode.c              |  4 ++--
 fs/ext4/inode.c              | 11 +++++------
 fs/gfs2/aops.c               |  3 +--
 fs/gfs2/ops_inode.c          |  6 +++---
 fs/ocfs2/aops.c              | 19 ++-----------------
 fs/ocfs2/aops.h              |  3 ---
 fs/ocfs2/file.c              |  9 ++++-----
 fs/reiserfs/inode.c          | 24 +++++++++++-------------
 fs/reiserfs/ioctl.c          |  6 ++----
 fs/reiserfs/xattr.c          |  5 +----
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 include/linux/buffer_head.h  |  1 -
 include/linux/reiserfs_fs.h  |  2 ++
 14 files changed, 39 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 7f0b9b083f77..a7b8f3c59a4e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1834,9 +1834,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block)
 {
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
 	sector_t block;
@@ -1916,7 +1918,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
 	}
 	return err;
 }
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to)
@@ -1953,15 +1955,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	return 0;
 }
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
-{
-	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-
-	return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
-
 /*
  * block_write_begin takes care of the basic task of block allocation and
  * bringing partial write blocks uptodate first.
@@ -2379,7 +2372,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	else
 		end = PAGE_CACHE_SIZE;
 
-	ret = block_prepare_write(page, 0, end, get_block);
+	ret = __block_write_begin(page, 0, end, get_block);
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..ad05353040a1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page,
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext3_get_block);
+		ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+					  ext3_get_block);
 		if (ret != 0) {
 			ext3_journal_stop(handle);
 			goto out_unlock;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..49635ef236f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle,
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
 	/*
-	 * __block_prepare_write() could have dirtied some buffers. Clean
+	 * __block_write_begin() could have dirtied some buffers. Clean
 	 * the dirty bit as jbd2_journal_get_write_access() could complain
 	 * otherwise about fs integrity issues. Setting of the dirty bit
-	 * by __block_prepare_write() isn't a real problem here as we clear
+	 * by __block_write_begin() isn't a real problem here as we clear
 	 * the bit before releasing a page lock and thus writeback cannot
 	 * ever write the buffer.
 	 */
@@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		if (buffer_delay(bh))
 			return 0; /* Not sure this could or should happen */
 		/*
-		 * XXX: __block_prepare_write() unmaps passed block,
-		 * is it OK?
+		 * XXX: __block_write_begin() unmaps passed block, is it OK?
 		 */
 		ret = ext4_da_reserve_space(inode, iblock);
 		if (ret)
@@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
  * This function is used as a standard get_block_t calback function
  * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
  * These functions should only try to map a single block at a time.
  *
  * Since this function doesn't do block allocations even if the caller
@@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page,
 		 * all are mapped and non delay. We don't want to
 		 * do block allocation here.
 		 */
-		ret = block_prepare_write(page, 0, len,
+		ret = __block_write_begin(page, 0, len,
 					  noalloc_get_block_write);
 		if (!ret) {
 			page_bufs = page_buffers(page);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb96aae..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	struct gfs2_alloc *al = NULL;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-	unsigned to = from + len;
 	struct page *page;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 prepare_write:
-	error = block_prepare_write(page, from, to, gfs2_block_map);
+	error = __block_write_begin(page, from, len, gfs2_block_map);
 out:
 	if (error == 0)
 		return 0;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510200d5..48a274f1674c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
 	int error;
 
 	if (!page_has_buffers(page)) {
-		error = block_prepare_write(page, from, to, gfs2_block_map);
+		error = __block_write_begin(page, from, to - from, gfs2_block_map);
 		if (unlikely(error))
 			return error;
 
@@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
 		next += bh->b_size;
 		if (buffer_mapped(bh)) {
 			if (end) {
-				error = block_prepare_write(page, start, end,
+				error = __block_write_begin(page, start, end - start,
 							    gfs2_block_map);
 				if (unlikely(error))
 					return error;
@@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
 	} while (next < to);
 
 	if (end) {
-		error = block_prepare_write(page, start, end, gfs2_block_map);
+		error = __block_write_begin(page, start, end - start, gfs2_block_map);
 		if (unlikely(error))
 			return error;
 		empty_write_end(page, start, end);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee118158..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	 * ocfs2 never allocates in this function - the only time we
 	 * need to use BH_New is when we're extending i_size on a file
 	 * system which doesn't support holes, in which case BH_New
-	 * allows block_prepare_write() to zero.
+	 * allows __block_write_begin() to zero.
 	 *
 	 * If we see this on a sparse file system, then a truncate has
 	 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to)
-{
-	int ret;
-
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-	return ret;
-}
-
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
  * mapping by now though, and the entire write will be allocating or
  * it won't, so not much need to use BH_New.
  *
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f663da6d..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to);
-
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
 							 unsigned from,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1ca6867935bb..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		block_end = block_start + (1 << inode->i_blkbits);
 
 		/*
-		 * block_start is block-aligned.  Bump it by one to
-		 * force ocfs2_{prepare,commit}_write() to zero the
+		 * block_start is block-aligned.  Bump it by one to force
+		 * __block_write_begin and block_commit_write to zero the
 		 * whole block.
 		 */
-		ret = ocfs2_prepare_write_nolock(inode, page,
-						 block_start + 1,
-						 block_start + 1);
+		ret = __block_write_begin(page, block_start + 1, 0,
+					  ocfs2_get_block);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..4dcb88046030 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it.  So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers.  Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 
 /* special version of get_block that is only used by grab_tail_page right
-** now.  It is sent to block_prepare_write, and when you try to get a
+** now.  It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
 	 **
 	 ** We must fix the tail page for writing because it might have buffers
 	 ** that are mapped, but have a block number of 0.  This indicates tail
-	 ** data that has been read directly into the page, and block_prepare_write
-	 ** won't trigger a get_block in this case.
+	 ** data that has been read directly into the page, and
+	 ** __block_write_begin won't trigger a get_block in this case.
 	 */
 	fix_tail_page_for_writing(tail_page);
-	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+	retval = __reiserfs_write_begin(tail_page, tail_start,
+				      tail_end - tail_start);
 	if (retval)
 		goto unlock;
 
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
 	/* start within the page of the last block in the file */
 	start = (offset / blocksize) * blocksize;
 
-	error = block_prepare_write(page, start, offset,
+	error = __block_write_begin(page, start, offset - start,
 				    reiserfs_get_block_create_0);
 	if (error)
 		goto unlock;
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
 	return ret;
 }
 
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to)
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 		th->t_refcount++;
 	}
 
-	ret = block_prepare_write(page, from, to, reiserfs_get_block);
+	ret = __block_write_begin(page, from, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly.  If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	}
 
 	/* we unpack by finding the page with the tail, and calling
-	 ** reiserfs_prepare_write on that page.  This will force a
+	 ** __reiserfs_write_begin on that page.  This will force a
 	 ** reiserfs_get_block to unpack the tail for us.
 	 */
 	index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	if (!page) {
 		goto out;
 	}
-	retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+	retval = __reiserfs_write_begin(page, write_from, 0);
 	if (retval)
 		goto out_unlock;
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..f7415de13878 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,8 +418,6 @@ static inline __u32 xattr_hash(const char *msg, int len)
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 static void update_ctime(struct inode *inode)
 {
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			rxh->h_hash = cpu_to_le32(xahash);
 		}
 
-		err = reiserfs_prepare_write(NULL, page, page_offset,
-					    page_offset + chunk + skip);
+		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
 		if (!err) {
 			if (buffer)
 				memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index ab31ce5aeaf9..cf808782c065 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -576,7 +576,7 @@ xfs_max_file_offset(
 
 	/* Figure out maximum filesize, on Linux this can depend on
 	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_prepare_write does this in an [unsigned] long...
+	 * __block_write_begin does this in an [unsigned] long...
 	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
 	 * So, for page sized blocks (4K on 32 bit platforms),
 	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index dd1b25b2641c..68d1fe7b877c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -212,7 +212,6 @@ int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
-int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, unsigned, struct page **, void **,
 			get_block_t *, loff_t *);
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 91a4177e60ce..5ca47e59b727 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2072,6 +2072,8 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
 void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
 
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
+
 /* namei.c */
 void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
 int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
-- 
cgit v1.2.3


From fde214d414218fb6cace35708730986bcc94fb53 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 4 Oct 2010 11:37:37 +0200
Subject: isofs: Fix isofs_get_blocks for 8TB files

Currently isofs_get_blocks() was limited to handle only 4TB files on 32-bit
architectures because of unnecessary use of iblock variable which was signed
long. Just remove the variable. The error messages that were using this
variable should have rather used b_off anyway because that is the block we
are currently mapping.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/inode.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 09ff41a752a0..60c2b944d762 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -962,25 +962,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
  * or getblk() if they are not.  Returns the number of blocks inserted
  * (-ve == error.)
  */
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
 		     struct buffer_head **bh, unsigned long nblocks)
 {
-	unsigned long b_off;
+	unsigned long b_off = iblock;
 	unsigned offset, sect_size;
 	unsigned int firstext;
 	unsigned long nextblk, nextoff;
-	long iblock = (long)iblock_s;
 	int section, rv, error;
 	struct iso_inode_info *ei = ISOFS_I(inode);
 
 	error = -EIO;
 	rv = 0;
-	if (iblock < 0 || iblock != iblock_s) {
+	if (iblock != b_off) {
 		printk(KERN_DEBUG "%s: block number too large\n", __func__);
 		goto abort;
 	}
 
-	b_off = iblock;
 
 	offset = 0;
 	firstext = ei->i_first_extent;
@@ -998,8 +996,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 		 * I/O errors.
 		 */
 		if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
-			printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
-				__func__, iblock, (unsigned long) inode->i_size);
+			printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
+				__func__, b_off,
+				(unsigned long long)inode->i_size);
 			goto abort;
 		}
 
@@ -1025,9 +1024,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 			if (++section > 100) {
 				printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
 					" aborting...\n", __func__);
-				printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+				printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
 					"nextblk=%lu nextoff=%lu\n", __func__,
-					iblock, firstext, (unsigned) sect_size,
+					b_off, firstext, (unsigned) sect_size,
 					nextblk, nextoff);
 				goto abort;
 			}
-- 
cgit v1.2.3


From 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 5 Oct 2010 09:32:55 +0200
Subject: fs: allow for more than 2^31 files

Andrew,

Could you please review this patch, you probably are the right guy to
take it, because it crosses fs and net trees.

Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesnt
depend on previous patch (sysctl: fix min/max handling in
__do_proc_doulongvec_minmax())

Thanks !

[PATCH V4] fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32bit value :

<quote>

We were seeing a failure which prevented boot.  The kernel was incapable
of creating either a named pipe or unix domain socket.  This comes down
to a common kernel function called unix_create1() which does:

        atomic_inc(&unix_nr_socks);
        if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000).  That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of
atomic_t.

get_max_files() is changed to return an unsigned long.
get_nr_files() is changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, while not
strictly needed to address Robin problem.

Before patch (on a 64bit kernel) :
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704     0       2147483648

Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 17 +++++++----------
 include/linux/fs.h |  8 ++++----
 kernel/sysctl.c    |  6 +++---
 net/unix/af_unix.c | 14 +++++++-------
 4 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
  * Return the total number of open files in the system
  */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
 	return percpu_counter_read_positive(&nr_files);
 }
@@ -68,7 +68,7 @@ static int get_nr_files(void)
 /*
  * Return the maximum number of open files in the system
  */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
 	return files_stat.max_files;
 }
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
 	const struct cred *cred = current_cred();
-	static int old_max;
+	static long old_max;
 	struct file * f;
 
 	/*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
 	/* Ran out of filps - report that */
 	if (get_nr_files() > old_max) {
-		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					get_max_files());
+		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
 		old_max = get_nr_files();
 	}
 	goto fail;
@@ -487,7 +486,7 @@ retry:
 
 void __init files_init(unsigned long mempages)
 { 
-	int n; 
+	unsigned long n;
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
 	 */ 
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
-	files_stat.max_files = n; 
-	if (files_stat.max_files < NR_FILE)
-		files_stat.max_files = NR_FILE;
+	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
 	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0a5d83633884..0cd6821013a0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -34,9 +34,9 @@
 
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
-	int nr_files;		/* read only */
-	int nr_free_files;	/* read only */
-	int max_files;		/* tunable */
+	unsigned long nr_files;		/* read only */
+	unsigned long nr_free_files;	/* read only */
+	unsigned long max_files;		/* tunable */
 };
 
 struct inodes_stat_t {
@@ -400,7 +400,7 @@ extern void __init inode_init_early(void);
 extern void __init files_init(unsigned long);
 
 extern struct files_stat_struct files_stat;
-extern int get_max_files(void);
+extern unsigned long get_max_files(void);
 extern int sysctl_nr_open;
 extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..694b140852c2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "file-nr",
 		.data		= &files_stat,
-		.maxlen		= 3*sizeof(int),
+		.maxlen		= sizeof(files_stat),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_files,
 	},
 	{
 		.procname	= "file-max",
 		.data		= &files_stat.max_files,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(files_stat.max_files),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "nr_open",
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0ebc777a6660..3c95304a0817 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
-static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
 	if (u->addr)
 		unix_release_addr(u->addr);
 
-	atomic_dec(&unix_nr_socks);
+	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 #ifdef UNIX_REFCNT_DEBUG
-	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk,
-		atomic_read(&unix_nr_socks));
+	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+		atomic_long_read(&unix_nr_socks));
 #endif
 }
 
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	atomic_inc(&unix_nr_socks);
-	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
-		atomic_dec(&unix_nr_socks);
+		atomic_long_dec(&unix_nr_socks);
 	else {
 		local_bh_disable();
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-- 
cgit v1.2.3


From 0e45b67d5aeb3dcfb6b149cf61c30b9a8e503f74 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 25 Aug 2010 09:05:27 +0200
Subject: affs: testing the wrong variable

The intent was to verify that bh = affs_bread_ino(...) returned a valid
pointer.  We checked "ext_bh" earlier in the function and it's valid
here.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
 		if (AFFS_SB(sb)->s_flags & SF_OFS) {
 			struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
 			u32 tmp;
-			if (IS_ERR(ext_bh)) {
+			if (IS_ERR(bh)) {
 				affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-					     ext, PTR_ERR(ext_bh));
+					     ext, PTR_ERR(bh));
 				return;
 			}
 			tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
-- 
cgit v1.2.3


From ba10f486658c0ca1bc84c936f6a996e40d071453 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Tue, 19 Oct 2010 14:27:59 -0700
Subject: hostfs: fix UML crash: remove f_spare from hostfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

365b1818 ("add f_flags to struct statfs(64)") resized f_spare within
struct statfs which caused a UML crash.  There is no need to copy f_spare.

Signed-off-by: Richard Weinberger <richard@nod.at>
Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Tested-by: Toralf Förster <toralf.foerster@gmx.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hostfs/hostfs.h      | 3 +--
 fs/hostfs/hostfs_kern.c | 2 +-
 fs/hostfs/hostfs_user.c | 9 ++-------
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 7c232c1487ee..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
 		     long long *files_out, long long *ffree_out,
-		     void *fsid_out, int fsid_size, long *namelen_out,
-		     long *spare_out);
+		     void *fsid_out, int fsid_size, long *namelen_out);
 
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..cd7c93917cc7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 	err = do_statfs(dentry->d_sb->s_fs_info,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-			&sf->f_namelen, sf->f_spare);
+			&sf->f_namelen);
 	if (err)
 		return err;
 	sf->f_blocks = f_blocks;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..8d02683585e0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -364,8 +364,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	      long long *bfree_out, long long *bavail_out,
 	      long long *files_out, long long *ffree_out,
-	      void *fsid_out, int fsid_size, long *namelen_out,
-	      long *spare_out)
+	      void *fsid_out, int fsid_size, long *namelen_out)
 {
 	struct statfs64 buf;
 	int err;
@@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	       sizeof(buf.f_fsid) > fsid_size ? fsid_size :
 	       sizeof(buf.f_fsid));
 	*namelen_out = buf.f_namelen;
-	spare_out[0] = buf.f_spare[0];
-	spare_out[1] = buf.f_spare[1];
-	spare_out[2] = buf.f_spare[2];
-	spare_out[3] = buf.f_spare[3];
-	spare_out[4] = buf.f_spare[4];
+
 	return 0;
 }
-- 
cgit v1.2.3


From 4a3956c790290efeb647bbb0c3a90476bb57800e Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 1 Oct 2010 14:20:22 -0700
Subject: vfs: introduce FMODE_UNSIGNED_OFFSET for allowing negative f_pos

Now, rw_verify_area() checsk f_pos is negative or not.  And if negative,
returns -EINVAL.

But, some special files as /dev/(k)mem and /proc/<pid>/mem etc..  has
negative offsets.  And we can't do any access via read/write to the
file(device).

So introduce FMODE_UNSIGNED_OFFSET to allow negative file offsets.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/mem.c |  4 ++++
 fs/proc/base.c     |  2 ++
 fs/read_write.c    | 28 ++++++++++++++++++++++++----
 include/linux/fs.h |  3 +++
 4 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index e985b1c2730e..1256454b2d43 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -876,6 +876,10 @@ static int memory_open(struct inode *inode, struct file *filp)
 	if (dev->dev_info)
 		filp->f_mapping->backing_dev_info = dev->dev_info;
 
+	/* Is /dev/mem or /dev/kmem ? */
+	if (dev->dev_info == &directly_mappable_cdev_bdi)
+		filp->f_mode |= FMODE_UNSIGNED_OFFSET;
+
 	if (dev->fops->open)
 		return dev->fops->open(inode, filp);
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc5d5f51f3fe..fb2a5abd4e4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
 static int mem_open(struct inode* inode, struct file* file)
 {
 	file->private_data = (void*)((long)current->self_exec_id);
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
 	return 0;
 }
 
diff --git a/fs/read_write.c b/fs/read_write.c
index e757ef26e4ce..9cd9d148105d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+	/*
+	 * pos or pos+count is negative here, check overflow.
+	 * too big "count" will be caught in rw_verify_area().
+	 */
+	if ((pos < 0) && (pos + count < pos))
+		return -EOVERFLOW;
+	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+		return 0;
+	return -EINVAL;
+}
+
 /**
  * generic_file_llseek_unlocked - lockless generic llseek implementation
  * @file:	file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+	if (offset < 0 && __negative_fpos_check(file, offset, 0))
+		return -EINVAL;
+	if (offset > inode->i_sb->s_maxbytes)
 		return -EINVAL;
 
 	/* Special lock needed here? */
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
-	if (offset >= 0) {
+	if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = 0;
@@ -221,6 +237,7 @@ bad:
 }
 #endif
 
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
@@ -238,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	if (unlikely((ssize_t) count < 0))
 		return retval;
 	pos = *ppos;
-	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-		return retval;
+	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
+		retval = __negative_fpos_check(file, pos, count);
+		if (retval)
+			return retval;
+	}
 
 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 		retval = locks_mandatory_area(
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0cd6821013a0..7fc126df1c42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -92,6 +92,9 @@ struct inodes_stat_t {
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)0x1000)
 
+/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
-- 
cgit v1.2.3


From 3072b90c4726396e38160cc1f7c93191e2f4a566 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Oct 2010 10:49:17 +0200
Subject: hfs: use sync_dirty_buffer

Use sync_dirty_buffer instead of the incorrect opencoding it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/hfs_fs.h | 11 -----------
 fs/hfs/mdb.c    |  4 ++--
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..1efcbc765d3c 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -254,17 +254,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
 	sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-	while (buffer_locked(bh)) {
-		wait_on_buffer(bh);
-	}
-	if (buffer_dirty(bh)) {
-		ll_rw_block(WRITE, 1, &bh);
-		wait_on_buffer(bh);
-	}
-}
-
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
 	sector_t __block;				\
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
 		mdb->drLsMod = hfs_mtime();
 
 		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
 	}
 
 	return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
 		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
 		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
 	}
 
 	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
-- 
cgit v1.2.3


From bb1e5f8c022ae2f6bf1ec0c6b861db3c35976377 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 15 Oct 2010 16:32:27 -0700
Subject: fs: move exportfs since it is not a networking filesystem

Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block
since it provides a library function that can be (and is) used by other
(non-network) filesystems.

This also eliminates a kconfig dependency warning:

warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alex Elder <aelder@sgi.com>
Cc: xfs-masters@oss.sgi.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 65781de44fc0..b5e582bd769d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+config EXPORTFS
+	tristate
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -222,9 +225,6 @@ config LOCKD_V4
 	depends on FILE_LOCKING
 	default y
 
-config EXPORTFS
-	tristate
-
 config NFS_ACL_SUPPORT
 	tristate
 	select FS_POSIX_ACL
-- 
cgit v1.2.3


From 8358e7d71e712d3bd4e20ecf23e6fd7480c83684 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 16 Oct 2010 12:40:33 +0900
Subject: fs/buffer.c: remove duplicated assignment on b_private

bh->b_private is initialized within init_buffer(), thus the
assignment should be redundant. Remove it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index a7b8f3c59a4e..74566c6f67b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -905,7 +905,6 @@ try_again:
 
 		bh->b_state = 0;
 		atomic_set(&bh->b_count, 0);
-		bh->b_private = NULL;
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
-- 
cgit v1.2.3


From 306fb0979443419288594446a348155a8027dcf2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 23 Aug 2010 10:47:55 -0400
Subject: aio: bump i_count instead of using igrab

The aio batching code is using igrab to get an extra reference on the
inode so it can safely batch.  igrab will go ahead and take the global
inode spinlock, which can be a bottleneck on large machines doing lots
of AIO.

In this case, igrab isn't required because we already have a reference
on the file handle.  It is safe to just bump the i_count directly
on the inode.

Benchmarking shows this patch brings IOP/s on tons of flash up by about
2.5X.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/aio.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..9e319a04780e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,20 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so igrab must always just
+	 * bump the count and move on.
+	 */
+	atomic_inc(&mapping->host->i_count);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
-- 
cgit v1.2.3


From a3314a0ed389f51a51695120b429eccd45b3a165 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 11 Oct 2010 22:38:00 +0900
Subject: lockdep: fixup checking of dir inode annotation

Since inode->i_mode shares its bits for S_IFMT, S_ISDIR should be
used to distinguish whether it is a dir or not.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 3368abd64bb5..4f0a67c54f89 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -663,7 +663,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	if (inode->i_mode & S_IFDIR) {
+	if (S_ISDIR(inode->i_mode)) {
 		struct file_system_type *type = inode->i_sb->s_type;
 
 		/* Set new key only if filesystem hasn't already changed it */
-- 
cgit v1.2.3


From 309f77ad9bea057d55b04580b5a711e9e3727e83 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 25 Oct 2010 15:01:12 +0900
Subject: fs/buffer.c: call __block_write_begin() if we have page

If we have the appropriate page already, call __block_write_begin()
directly instead of releasing and regrabbing it inside of
block_write_begin().

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 74566c6f67b1..d895d9fd5b71 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2458,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
 	*fsdata = NULL;
 
 	if (page_has_buffers(page)) {
-		unlock_page(page);
-		page_cache_release(page);
-		*pagep = NULL;
-		return block_write_begin(mapping, pos, len, flags, pagep,
-					 get_block);
+		ret = __block_write_begin(page, pos, len, get_block);
+		if (unlikely(ret))
+			goto out_release;
+		return ret;
 	}
 
 	if (PageMappedToDisk(page))
-- 
cgit v1.2.3


From 8e3b9a072d071700e83e88b0bf59115c59042885 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Oct 2010 09:45:22 -0400
Subject: ext2_remount: don't bother with invalidate_inodes()

It's pointless - we *do* have busy inodes (root directory,
for one), so that call will fail and attempt to change
XIP flag will be ignored.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/super.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 85df87d0f7b7..0901320671da 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	}
 
 	es = sbi->s_es;
-	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
-	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
-	    invalidate_inodes(sb)) {
+	if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
 		ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
 			 "xip flag with busy inodes while remounting");
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
-- 
cgit v1.2.3


From 9dcefee508d547eed88f3c578dc92819bdeaa952 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Oct 2010 09:46:50 -0400
Subject: gfs2: invalidate_inodes() is no-op there

In fill_super() we hadn't MS_ACTIVE set yet, so there won't
be any inodes with zero i_count sitting around.

In put_super() we already have MS_ACTIVE removed *and* we
had called invalidate_inodes() since then.  So again there
won't be any inodes with zero i_count...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/ops_fstype.c | 1 -
 fs/gfs2/super.c      | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc233dc89..cade1acbcea9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
 fail_sys:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d1176096c..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/*  Take apart glock structures and buffer lists  */
-	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/*  Unmount the locking protocol  */
 	gfs2_lm_unmount(sdp);
-- 
cgit v1.2.3


From 70fd136ecc44ca8c364f4412f447de1d940c3639 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Oct 2010 09:50:11 -0400
Subject: ntfs: don't call invalidate_inodes()

We are in fill_super(); again, no inodes with zero i_count could
be around until we set MS_ACTIVE.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ntfs/super.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 19c5180f8a28..d4dec2d32117 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now:
 	if (vol->mft_ino && vol->mft_ino != tmp_ino)
 		iput(vol->mft_ino);
 	vol->mft_ino = NULL;
-	/*
-	 * This is needed to get ntfs_clear_extent_inode() called for each
-	 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-	 * leak resources and B) a subsequent mount fails automatically due to
-	 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-	 * method again... FIXME: Do we need to do this twice now because of
-	 * attribute inodes? I think not, so leave as is for now... (AIA)
-	 */
-	if (invalidate_inodes(sb)) {
-		ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-				"driver bug.");
-		/* Copied from fs/super.c. I just love this message. (-; */
-		printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-				"seconds.  Have a nice day...\n");
-	}
 	/* Errors at this stage are irrelevant. */
 err_out_now:
 	sb->s_fs_info = NULL;
-- 
cgit v1.2.3


From 61ebdb4254e3ecb59022d2c730b57b04d0eeecc6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Oct 2010 10:48:15 -0400
Subject: smbfs never retains inodes with zero refcount in the first place

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/smbfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 8fc5e50e142f..f6e9ee59757e 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server)
 {
 	VERBOSE("\n");
 	shrink_dcache_sb(SB_of(server));
-	invalidate_inodes(SB_of(server));
 }
 
 /*
-- 
cgit v1.2.3


From a8dade34e3df581bc36ca2afe6e27055e178801c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Oct 2010 11:13:10 -0400
Subject: unexport invalidate_inodes

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 1 -
 fs/internal.h      | 5 +++++
 include/linux/fs.h | 1 -
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 4f0a67c54f89..db7c74c7dd80 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -417,7 +417,6 @@ int invalidate_inodes(struct super_block *sb)
 
 	return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
 {
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..f6dce46d80dc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,8 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+
+/*
+ * inode.c
+ */
+extern int invalidate_inodes(struct super_block *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7fc126df1c42..c3f6daf749cc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2082,7 +2082,6 @@ extern int check_disk_change(struct block_device *);
 extern int __invalidate_device(struct block_device *);
 extern int invalidate_partition(struct gendisk *, int);
 #endif
-extern int invalidate_inodes(struct super_block *);
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
 					pgoff_t start, pgoff_t end);
 
-- 
cgit v1.2.3


From 1d3382cbf02986e4833849f528d451367ea0b4cb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Oct 2010 15:19:20 -0400
Subject: new helper: inode_unhashed()

note: for race-free uses you inode_lock held

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c    | 2 +-
 fs/fs-writeback.c   | 2 +-
 fs/inode.c          | 6 +++---
 fs/reiserfs/xattr.c | 2 +-
 include/linux/fs.h  | 5 +++++
 mm/shmem.c          | 4 ++--
 6 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..f6f2a0da2695 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3849,7 +3849,7 @@ again:
 	p = &root->inode_tree.rb_node;
 	parent = NULL;
 
-	if (hlist_unhashed(&inode->i_hash))
+	if (inode_unhashed(inode))
 		return;
 
 	spin_lock(&root->inode_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 29e3f409bbd0..39f44f2e709a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -962,7 +962,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * dirty list.  Add blockdev inodes as well.
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
+			if (inode_unhashed(inode))
 				goto out;
 		}
 		if (inode->i_state & I_FREEING)
diff --git a/fs/inode.c b/fs/inode.c
index db7c74c7dd80..4440cf1034ec 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1094,7 +1094,7 @@ int insert_inode_locked(struct inode *inode)
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
-		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
 			return -EBUSY;
 		}
@@ -1133,7 +1133,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
-		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
 			return -EBUSY;
 		}
@@ -1186,7 +1186,7 @@ EXPORT_SYMBOL(generic_delete_inode);
  */
 int generic_drop_inode(struct inode *inode)
 {
-	return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+	return !inode->i_nlink || inode_unhashed(inode);
 }
 EXPORT_SYMBOL_GPL(generic_drop_inode);
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f7415de13878..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -422,7 +422,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 static void update_ctime(struct inode *inode)
 {
 	struct timespec now = current_fs_time(inode->i_sb);
-	if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+	if (inode_unhashed(inode) || !inode->i_nlink ||
 	    timespec_equal(&inode->i_ctime, &now))
 		return;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3f6daf749cc..78043da85e1f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -786,6 +786,11 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 };
 
+static inline int inode_unhashed(struct inode *inode)
+{
+	return hlist_unhashed(&inode->i_hash);
+}
+
 /*
  * inode->i_mutex nesting subclasses for the lock validator:
  *
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..27a58120dbd5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2146,7 +2146,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 	if (*len < 3)
 		return 255;
 
-	if (hlist_unhashed(&inode->i_hash)) {
+	if (inode_unhashed(inode)) {
 		/* Unfortunately insert_inode_hash is not idempotent,
 		 * so as we hash inodes here rather than at creation
 		 * time, we need a lock to ensure we only try
@@ -2154,7 +2154,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 		 */
 		static DEFINE_SPINLOCK(lock);
 		spin_lock(&lock);
-		if (hlist_unhashed(&inode->i_hash))
+		if (inode_unhashed(inode))
 			__insert_inode_hash(inode,
 					    inode->i_ino + inode->i_generation);
 		spin_unlock(&lock);
-- 
cgit v1.2.3


From 756acc2d61712a8cafe2aa6ad626c60a185d3645 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Oct 2010 15:23:40 -0400
Subject: list.h: new helper - hlist_add_fake()

Make node look as if it was on hlist, with hlist_del()
working correctly.  Usable without any locking...

Convert a couple of places where we want to do that to
inode->i_hash.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfsplus/inode.c   | 2 +-
 fs/jfs/jfs_imap.c    | 2 +-
 include/linux/list.h | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 78449280dae0..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	 * appear hashed, but do not put on any lists.  hlist_del()
 	 * will work fine and require no locking.
 	 */
-	inode->i_hash.pprev = &inode->i_hash.next;
+	hlist_add_fake(&inode->i_hash);
 
 	mark_inode_dirty(inode);
 out:
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	 * appear hashed, but do not put on any lists.  hlist_del()
 	 * will work fine and require no locking.
 	 */
-	ip->i_hash.pprev = &ip->i_hash.next;
+	hlist_add_fake(&ip->i_hash);
 
 	return (ip);
 }
diff --git a/include/linux/list.h b/include/linux/list.h
index 88a000617d77..9a5f8a71810c 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -636,6 +636,12 @@ static inline void hlist_add_after(struct hlist_node *n,
 		next->next->pprev  = &next->next;
 }
 
+/* after that we'll appear to be on some hlist and hlist_del will work */
+static inline void hlist_add_fake(struct hlist_node *n)
+{
+	n->pprev = &n->next;
+}
+
 /*
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
-- 
cgit v1.2.3


From 89b0fc38cca4e6c92a90b58960881ffc5dddd89c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Oct 2010 15:26:21 -0400
Subject: switch hfs to hlist_add_fake()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/hfs_fs.h | 2 --
 fs/hfs/inode.c  | 2 +-
 fs/hfs/super.c  | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 1efcbc765d3c..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
 	u16 blockoffset;
 
 	int fs_div;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY	0
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 33254160f650..6ee1586f2334 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -382,7 +382,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
-- 
cgit v1.2.3


From be1a16a0ae29a7c90081a657b64aa51cb1a65a27 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 5 Oct 2010 12:31:09 +0200
Subject: vfs: fix infinite loop caused by clone_mnt race

If clone_mnt() happens while mnt_make_readonly() is running, the
cloned mount might have MNT_WRITE_HOLD flag set, which results in
mnt_want_write() spinning forever on this mount.

Needs CAP_SYS_ADMIN to trigger deliberately and unlikely to happen
accidentally.  But if it does happen it can hang the machine.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7ca5182c0bed..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 				goto out_free;
 		}
 
-		mnt->mnt_flags = old->mnt_flags;
+		mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
 		atomic_inc(&sb->s_active);
 		mnt->mnt_sb = sb;
 		mnt->mnt_root = dget(root);
-- 
cgit v1.2.3


From cffbc8aa334f55c9ed42d25202eb3ebf3a97c195 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sat, 23 Oct 2010 05:03:02 -0400
Subject: fs: Convert nr_inodes and nr_unused to per-cpu counters

The number of inodes allocated does not need to be tied to the
addition or removal of an inode to/from a list. If we are not tied
to a list lock, we could update the counters when inodes are
initialised or destroyed, but to do that we need to convert the
counters to be per-cpu (i.e. independent of a lock). This means that
we have the freedom to change the list/locking implementation
without needing to care about the counters.

Based on a patch originally from Eric Dumazet.

[AV: cleaned up a bit, fixed build breakage on weird configs

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c  |  5 ++---
 fs/inode.c         | 64 ++++++++++++++++++++++++++++++++++++++----------------
 fs/internal.h      |  1 +
 include/linux/fs.h |  3 ++-
 kernel/sysctl.c    |  4 ++--
 5 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 39f44f2e709a..f04d04af84f2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -723,7 +723,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	wb->last_old_flush = jiffies;
 	nr_pages = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+			get_nr_dirty_inodes();
 
 	if (nr_pages) {
 		struct wb_writeback_work work = {
@@ -1090,8 +1090,7 @@ void writeback_inodes_sb(struct super_block *sb)
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	work.nr_pages = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
 
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
diff --git a/fs/inode.c b/fs/inode.c
index 4440cf1034ec..0d5aeccbdd90 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -103,8 +103,41 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+
 static struct kmem_cache *inode_cachep __read_mostly;
 
+static inline int get_nr_inodes(void)
+{
+	return percpu_counter_sum_positive(&nr_inodes);
+}
+
+static inline int get_nr_inodes_unused(void)
+{
+	return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+
+int get_nr_dirty_inodes(void)
+{
+	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	return nr_dirty > 0 ? nr_dirty : 0;
+
+}
+
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	inodes_stat.nr_inodes = get_nr_inodes();
+	inodes_stat.nr_unused = get_nr_inodes_unused();
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
 static void wake_up_inode(struct inode *inode)
 {
 	/*
@@ -192,6 +225,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_fsnotify_mask = 0;
 #endif
 
+	percpu_counter_inc(&nr_inodes);
+
 	return 0;
 out:
 	return -ENOMEM;
@@ -232,6 +267,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
+	percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -286,7 +322,7 @@ void __iget(struct inode *inode)
 
 	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 		list_move(&inode->i_list, &inode_in_use);
-	inodes_stat.nr_unused--;
+	percpu_counter_dec(&nr_inodes_unused);
 }
 
 void end_writeback(struct inode *inode)
@@ -327,8 +363,6 @@ static void evict(struct inode *inode)
  */
 static void dispose_list(struct list_head *head)
 {
-	int nr_disposed = 0;
-
 	while (!list_empty(head)) {
 		struct inode *inode;
 
@@ -344,11 +378,7 @@ static void dispose_list(struct list_head *head)
 
 		wake_up_inode(inode);
 		destroy_inode(inode);
-		nr_disposed++;
 	}
-	spin_lock(&inode_lock);
-	inodes_stat.nr_inodes -= nr_disposed;
-	spin_unlock(&inode_lock);
 }
 
 /*
@@ -357,7 +387,7 @@ static void dispose_list(struct list_head *head)
 static int invalidate_list(struct list_head *head, struct list_head *dispose)
 {
 	struct list_head *next;
-	int busy = 0, count = 0;
+	int busy = 0;
 
 	next = head->next;
 	for (;;) {
@@ -383,13 +413,11 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			list_move(&inode->i_list, dispose);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
-			count++;
+			percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 		busy = 1;
 	}
-	/* only unused inodes may be cached with i_count zero */
-	inodes_stat.nr_unused -= count;
 	return busy;
 }
 
@@ -447,7 +475,6 @@ static int can_unuse(struct inode *inode)
 static void prune_icache(int nr_to_scan)
 {
 	LIST_HEAD(freeable);
-	int nr_pruned = 0;
 	int nr_scanned;
 	unsigned long reap = 0;
 
@@ -483,9 +510,8 @@ static void prune_icache(int nr_to_scan)
 		list_move(&inode->i_list, &freeable);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
-		nr_pruned++;
+		percpu_counter_dec(&nr_inodes_unused);
 	}
-	inodes_stat.nr_unused -= nr_pruned;
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
@@ -517,7 +543,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 			return -1;
 		prune_icache(nr);
 	}
-	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker icache_shrinker = {
@@ -594,7 +620,6 @@ static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
 {
-	inodes_stat.nr_inodes++;
 	list_add(&inode->i_list, &inode_in_use);
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	if (head)
@@ -1214,7 +1239,7 @@ static void iput_final(struct inode *inode)
 	if (!drop) {
 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 			list_move(&inode->i_list, &inode_unused);
-		inodes_stat.nr_unused++;
+		percpu_counter_inc(&nr_inodes_unused);
 		if (sb->s_flags & MS_ACTIVE) {
 			spin_unlock(&inode_lock);
 			return;
@@ -1226,14 +1251,13 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		inodes_stat.nr_unused--;
+		percpu_counter_dec(&nr_inodes_unused);
 		hlist_del_init(&inode->i_hash);
 	}
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 	evict(inode);
 	spin_lock(&inode_lock);
@@ -1502,6 +1526,8 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
+	percpu_counter_init(&nr_inodes, 0);
+	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index f6dce46d80dc..4cc67eb6ed56 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -105,4 +105,5 @@ extern void release_open_intent(struct nameidata *);
 /*
  * inode.c
  */
+extern int get_nr_dirty_inodes(void);
 extern int invalidate_inodes(struct super_block *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 78043da85e1f..a3937a8ee95e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2486,7 +2486,8 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 struct ctl_table;
 int proc_nr_files(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
-
+int proc_nr_inodes(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos);
 int __init get_filesystem_list(char *buf);
 
 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 694b140852c2..99a510cbfbb3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1340,14 +1340,14 @@ static struct ctl_table fs_table[] = {
 		.data		= &inodes_stat,
 		.maxlen		= 2*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_nr_inodes,
 	},
 	{
 		.procname	= "inode-state",
 		.data		= &inodes_stat,
 		.maxlen		= 7*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_nr_inodes,
 	},
 	{
 		.procname	= "file-nr",
-- 
cgit v1.2.3


From 9e38d86ff2d8a8db99570e982230861046df32b5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 23 Oct 2010 06:55:17 -0400
Subject: fs: Implement lazy LRU updates for inodes

Convert the inode LRU to use lazy updates to reduce lock and
cacheline traffic.  We avoid moving inodes around in the LRU list
during iget/iput operations so these frequent operations don't need
to access the LRUs. Instead, we defer the refcount checks to
reclaim-time and use a per-inode state flag, I_REFERENCED, to tell
reclaim that iget has touched the inode in the past. This means that
only reclaim should be touching the LRU with any frequency, hence
significantly reducing lock acquisitions and the amount contention
on LRU updates.

This also removes the inode_in_use list, which means we now only
have one list for tracking the inode LRU status. This makes it much
simpler to split out the LRU list operations under it's own lock.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c         | 11 +++---
 fs/inode.c                | 86 +++++++++++++++++++++++++++++++++--------------
 include/linux/fs.h        | 13 +++----
 include/linux/writeback.h |  2 --
 4 files changed, 71 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f04d04af84f2..e8f65290e836 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (atomic_read(&inode->i_count)) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean.  At this point we either have
+			 * a reference to the inode or it's on it's way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_list);
 		}
 	}
 	inode_sync_complete(inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0d5aeccbdd90..3bdc76f1653a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_list);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -317,12 +317,23 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_inc_return(&inode->i_count) != 1)
-		return;
+	atomic_inc(&inode->i_count);
+}
 
-	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-		list_move(&inode->i_list, &inode_in_use);
-	percpu_counter_dec(&nr_inodes_unused);
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_empty(&inode->i_list)) {
+		list_add(&inode->i_list, &inode_unused);
+		percpu_counter_inc(&nr_inodes_unused);
+	}
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_list)) {
+		list_del_init(&inode->i_list);
+		percpu_counter_dec(&nr_inodes_unused);
+	}
 }
 
 void end_writeback(struct inode *inode)
@@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head)
 		struct inode *inode;
 
 		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		list_del_init(&inode->i_list);
 
 		evict(inode);
 
@@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			list_move(&inode->i_list, dispose);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
-			percpu_counter_dec(&nr_inodes_unused);
+			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+				percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 		busy = 1;
@@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb)
 
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
- * the front of the inode_unused list.  So look for it there and if the
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed.  If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
  *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
  */
 static void prune_icache(int nr_to_scan)
 {
@@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_unused.prev, struct inode, i_list);
 
-		if (inode->i_state || atomic_read(&inode->i_count)) {
+		/*
+		 * Referenced or dirty inodes are still in use. Give them
+		 * another pass through the LRU as we canot reclaim them now.
+		 */
+		if (atomic_read(&inode->i_count) ||
+		    (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_list);
+			percpu_counter_dec(&nr_inodes_unused);
+			continue;
+		}
+
+		/* recently referenced inodes get one more pass */
+		if (inode->i_state & I_REFERENCED) {
 			list_move(&inode->i_list, &inode_unused);
+			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -620,7 +648,6 @@ static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
 {
-	list_add(&inode->i_list, &inode_in_use);
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	if (head)
 		hlist_add_head(&inode->i_hash, head);
@@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode)
 		drop = generic_drop_inode(inode);
 
 	if (!drop) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		percpu_counter_inc(&nr_inodes_unused);
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+				inode_lru_list_add(inode);
+			}
 			spin_unlock(&inode_lock);
 			return;
 		}
@@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		percpu_counter_dec(&nr_inodes_unused);
 		hlist_del_init(&inode->i_hash);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+
+	/*
+	 * After we delete the inode from the LRU here, we avoid moving dirty
+	 * inodes back onto the LRU now because I_FREEING is set and hence
+	 * writeback_single_inode() won't move the inode around.
+	 */
+	inode_lru_list_del(inode);
+
+	list_del_init(&inode->i_sb_list);
 	spin_unlock(&inode_lock);
 	evict(inode);
 	spin_lock(&inode_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a3937a8ee95e..876275fc0638 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,16 +1641,17 @@ struct super_operations {
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
-#define I_DIRTY_SYNC		1
-#define I_DIRTY_DATASYNC	2
-#define I_DIRTY_PAGES		4
+#define I_DIRTY_SYNC		(1 << 0)
+#define I_DIRTY_DATASYNC	(1 << 1)
+#define I_DIRTY_PAGES		(1 << 2)
 #define __I_NEW			3
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		16
-#define I_FREEING		32
-#define I_CLEAR			64
+#define I_WILL_FREE		(1 << 4)
+#define I_FREEING		(1 << 5)
+#define I_CLEAR			(1 << 6)
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define I_REFERENCED		(1 << 8)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 72a5d647a5f2..242b6f812ba6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,8 +10,6 @@
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
-extern struct list_head inode_in_use;
-extern struct list_head inode_unused;
 
 /*
  * fs/fs-writeback.c
-- 
cgit v1.2.3


From 4c51acbc66f754e536e1c9e3331656b69bce86d0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sat, 23 Oct 2010 06:58:09 -0400
Subject: fs: Factor inode hash operations into functions

Before replacing the inode hash locking with a more scalable
mechanism, factor the removal of the inode from the hashes rather
than open coding it in several places.

Based on a patch originally from Nick Piggin.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 100 +++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 55 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 3bdc76f1653a..5e5bafe70ceb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -336,6 +336,58 @@ static void inode_lru_list_del(struct inode *inode)
 	}
 }
 
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+	unsigned long tmp;
+
+	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+			L1_CACHE_BYTES;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+	return tmp & I_HASHMASK;
+}
+
+/**
+ *	__insert_inode_hash - hash an inode
+ *	@inode: unhashed inode
+ *	@hashval: unsigned long value used to locate this object in the
+ *		inode_hashtable.
+ *
+ *	Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
+	spin_lock(&inode_lock);
+	hlist_add_head(&inode->i_hash, head);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+
+/**
+ *	__remove_inode_hash - remove an inode from the hash
+ *	@inode: inode to unhash
+ *
+ *	Remove an inode from the superblock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+	hlist_del_init(&inode->i_hash);
+}
+
+/**
+ *	remove_inode_hash - remove an inode from the hash
+ *	@inode: inode to unhash
+ *
+ *	Remove an inode from the superblock.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
+
 void end_writeback(struct inode *inode)
 {
 	might_sleep();
@@ -383,7 +435,7 @@ static void dispose_list(struct list_head *head)
 		evict(inode);
 
 		spin_lock(&inode_lock);
-		hlist_del_init(&inode->i_hash);
+		__remove_inode_hash(inode);
 		list_del_init(&inode->i_sb_list);
 		spin_unlock(&inode_lock);
 
@@ -634,16 +686,6 @@ repeat:
 	return node ? inode : NULL;
 }
 
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-	unsigned long tmp;
-
-	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-			L1_CACHE_BYTES;
-	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-	return tmp & I_HASHMASK;
-}
-
 static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
@@ -1194,36 +1236,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 }
 EXPORT_SYMBOL(insert_inode_locked4);
 
-/**
- *	__insert_inode_hash - hash an inode
- *	@inode: unhashed inode
- *	@hashval: unsigned long value used to locate this object in the
- *		inode_hashtable.
- *
- *	Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
-	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-	spin_lock(&inode_lock);
-	hlist_add_head(&inode->i_hash, head);
-	spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
-
-/**
- *	remove_inode_hash - remove an inode from the hash
- *	@inode: inode to unhash
- *
- *	Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
-{
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(remove_inode_hash);
 
 int generic_delete_inode(struct inode *inode)
 {
@@ -1279,7 +1291,7 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		hlist_del_init(&inode->i_hash);
+		__remove_inode_hash(inode);
 	}
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
@@ -1294,9 +1306,7 @@ static void iput_final(struct inode *inode)
 	list_del_init(&inode->i_sb_list);
 	spin_unlock(&inode_lock);
 	evict(inode);
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
+	remove_inode_hash(inode);
 	wake_up_inode(inode);
 	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	destroy_inode(inode);
-- 
cgit v1.2.3


From ad5e195ac9fdf4e2b28b8cf14937e5b9384dac2e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Oct 2010 07:00:16 -0400
Subject: fs: Stop abusing find_inode_fast in iunique

Stop abusing find_inode_fast for iunique and opencode the inode hash walk.
Introduce a new iunique_lock to protect the iunique counters once inode_lock
is removed.

Based on a patch originally from Nick Piggin.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 5e5bafe70ceb..a8035e8576df 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -884,6 +884,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	return inode;
 }
 
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+	struct hlist_head *b = inode_hashtable + hash(sb, ino);
+	struct hlist_node *node;
+	struct inode *inode;
+
+	hlist_for_each_entry(inode, node, b, i_hash) {
+		if (inode->i_ino == ino && inode->i_sb == sb)
+			return 0;
+	}
+
+	return 1;
+}
+
 /**
  *	iunique - get a unique inode number
  *	@sb: superblock
@@ -905,19 +926,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 	 * error if st_ino won't fit in target struct field. Use 32bit counter
 	 * here to attempt to avoid that.
 	 */
+	static DEFINE_SPINLOCK(iunique_lock);
 	static unsigned int counter;
-	struct inode *inode;
-	struct hlist_head *head;
 	ino_t res;
 
 	spin_lock(&inode_lock);
+	spin_lock(&iunique_lock);
 	do {
 		if (counter <= max_reserved)
 			counter = max_reserved + 1;
 		res = counter++;
-		head = inode_hashtable + hash(sb, res);
-		inode = find_inode_fast(sb, head, res);
-	} while (inode != NULL);
+	} while (!test_inode_iunique(sb, res));
+	spin_unlock(&iunique_lock);
 	spin_unlock(&inode_lock);
 
 	return res;
-- 
cgit v1.2.3


From f7899bd5472e8e99741369b4a32eca44e5282a85 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Oct 2010 07:09:06 -0400
Subject: fs: move i_count increments into find_inode/find_inode_fast

Now that iunique is not abusing find_inode anymore we can move the i_ref
increment back to where it belongs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index a8035e8576df..78c41c626cdc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -634,9 +634,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
  */
 static struct inode *find_inode(struct super_block *sb,
 				struct hlist_head *head,
@@ -656,9 +653,10 @@ repeat:
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
-		break;
+		__iget(inode);
+		return inode;
 	}
-	return node ? inode : NULL;
+	return NULL;
 }
 
 /*
@@ -681,9 +679,10 @@ repeat:
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
-		break;
+		__iget(inode);
+		return inode;
 	}
-	return node ? inode : NULL;
+	return NULL;
 }
 
 static inline void
@@ -828,7 +827,6 @@ static struct inode *get_new_inode(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		__iget(old);
 		spin_unlock(&inode_lock);
 		destroy_inode(inode);
 		inode = old;
@@ -875,7 +873,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		__iget(old);
 		spin_unlock(&inode_lock);
 		destroy_inode(inode);
 		inode = old;
@@ -989,7 +986,6 @@ static struct inode *ifind(struct super_block *sb,
 	spin_lock(&inode_lock);
 	inode = find_inode(sb, head, test, data);
 	if (inode) {
-		__iget(inode);
 		spin_unlock(&inode_lock);
 		if (likely(wait))
 			wait_on_inode(inode);
@@ -1022,7 +1018,6 @@ static struct inode *ifind_fast(struct super_block *sb,
 	spin_lock(&inode_lock);
 	inode = find_inode_fast(sb, head, ino);
 	if (inode) {
-		__iget(inode);
 		spin_unlock(&inode_lock);
 		wait_on_inode(inode);
 		return inode;
-- 
cgit v1.2.3


From 646ec4615cd05972581c9c5342ed7a1e77df17bb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Oct 2010 07:15:32 -0400
Subject: fs: remove inode_add_to_list/__inode_add_to_list

Split up inode_add_to_list/__inode_add_to_list.  Locking for the two
lists will be split soon so these helpers really don't buy us much
anymore.

The __ prefixes for the sb list helpers will go away soon, but until
inode_lock is gone we'll need them to distinguish between the locked
and unlocked variants.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c                  | 70 +++++++++++++++++++++------------------------
 fs/xfs/linux-2.6/xfs_iops.c |  4 ++-
 include/linux/fs.h          |  5 ++--
 3 files changed, 38 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 78c41c626cdc..430d70f2abe7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -336,6 +336,28 @@ static void inode_lru_list_del(struct inode *inode)
 	}
 }
 
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+}
+
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	__inode_sb_list_add(inode);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+	list_del_init(&inode->i_sb_list);
+}
+
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
 {
 	unsigned long tmp;
@@ -356,9 +378,10 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
  */
 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
-	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
+	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
 	spin_lock(&inode_lock);
-	hlist_add_head(&inode->i_hash, head);
+	hlist_add_head(&inode->i_hash, b);
 	spin_unlock(&inode_lock);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
@@ -436,7 +459,7 @@ static void dispose_list(struct list_head *head)
 
 		spin_lock(&inode_lock);
 		__remove_inode_hash(inode);
-		list_del_init(&inode->i_sb_list);
+		__inode_sb_list_del(inode);
 		spin_unlock(&inode_lock);
 
 		wake_up_inode(inode);
@@ -685,37 +708,6 @@ repeat:
 	return NULL;
 }
 
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-			struct inode *inode)
-{
-	list_add(&inode->i_sb_list, &sb->s_inodes);
-	if (head)
-		hlist_add_head(&inode->i_hash, head);
-}
-
-/**
- * inode_add_to_lists - add a new inode to relevant lists
- * @sb: superblock inode belongs to
- * @inode: inode to mark in use
- *
- * When an inode is allocated it needs to be accounted for, added to the in use
- * list, the owning superblock and the inode hash. This needs to be done under
- * the inode_lock, so export a function to do this rather than the inode lock
- * itself. We calculate the hash list to add to here so it is all internal
- * which requires the caller to have already set up the inode number in the
- * inode to add.
- */
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
-
-	spin_lock(&inode_lock);
-	__inode_add_to_lists(sb, head, inode);
-	spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
-
 /**
  *	new_inode 	- obtain an inode
  *	@sb: superblock
@@ -743,7 +735,7 @@ struct inode *new_inode(struct super_block *sb)
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
-		__inode_add_to_lists(sb, NULL, inode);
+		__inode_sb_list_add(inode);
 		inode->i_ino = ++last_ino;
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
@@ -812,7 +804,8 @@ static struct inode *get_new_inode(struct super_block *sb,
 			if (set(inode, data))
 				goto set_failed;
 
-			__inode_add_to_lists(sb, head, inode);
+			hlist_add_head(&inode->i_hash, head);
+			__inode_sb_list_add(inode);
 			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -858,7 +851,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
 			inode->i_ino = ino;
-			__inode_add_to_lists(sb, head, inode);
+			hlist_add_head(&inode->i_hash, head);
+			__inode_sb_list_add(inode);
 			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -1318,7 +1312,7 @@ static void iput_final(struct inode *inode)
 	 */
 	inode_lru_list_del(inode);
 
-	list_del_init(&inode->i_sb_list);
+	__inode_sb_list_del(inode);
 	spin_unlock(&inode_lock);
 	evict(inode);
 	remove_inode_hash(inode);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ec858e09d546..71d83c93621c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -760,7 +760,9 @@ xfs_setup_inode(
 
 	inode->i_ino = ip->i_ino;
 	inode->i_state = I_NEW;
-	inode_add_to_lists(ip->i_mount->m_super, inode);
+
+	inode_sb_list_add(inode);
+	insert_inode_hash(inode);
 
 	inode->i_mode	= ip->i_d.di_mode;
 	inode->i_nlink	= ip->i_d.di_nlink;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 876275fc0638..d43e8b6685a2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2171,7 +2171,6 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
-extern void inode_add_to_lists(struct super_block *, struct inode *);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
@@ -2202,9 +2201,11 @@ extern int file_remove_suid(struct file *);
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 extern void remove_inode_hash(struct inode *);
-static inline void insert_inode_hash(struct inode *inode) {
+static inline void insert_inode_hash(struct inode *inode)
+{
 	__insert_inode_hash(inode, inode->i_ino);
 }
+extern void inode_sb_list_add(struct inode *inode);
 
 #ifdef CONFIG_BLOCK
 extern void submit_bio(int, struct bio *);
-- 
cgit v1.2.3


From 7de9c6ee3ecffd99e1628e81a5ea5468f7581a1f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Oct 2010 11:11:40 -0400
Subject: new helper: ihold()

Clones an existing reference to inode; caller must already hold one.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c           | 5 +++--
 fs/affs/inode.c             | 2 +-
 fs/afs/dir.c                | 2 +-
 fs/aio.c                    | 5 ++---
 fs/anon_inodes.c            | 5 ++---
 fs/bfs/dir.c                | 2 +-
 fs/block_dev.c              | 8 ++++----
 fs/btrfs/inode.c            | 2 +-
 fs/coda/dir.c               | 2 +-
 fs/exofs/namei.c            | 2 +-
 fs/ext2/namei.c             | 2 +-
 fs/ext3/namei.c             | 2 +-
 fs/ext4/namei.c             | 2 +-
 fs/gfs2/ops_inode.c         | 2 +-
 fs/hfsplus/dir.c            | 2 +-
 fs/inode.c                  | 9 +++++++++
 fs/jffs2/dir.c              | 4 ++--
 fs/jfs/jfs_txnmgr.c         | 2 +-
 fs/jfs/namei.c              | 2 +-
 fs/libfs.c                  | 2 +-
 fs/logfs/dir.c              | 2 +-
 fs/minix/namei.c            | 2 +-
 fs/namei.c                  | 2 +-
 fs/nfs/dir.c                | 2 +-
 fs/nfs/getroot.c            | 3 +--
 fs/nilfs2/namei.c           | 2 +-
 fs/ntfs/super.c             | 4 ++--
 fs/ocfs2/namei.c            | 2 +-
 fs/reiserfs/namei.c         | 2 +-
 fs/sysv/namei.c             | 2 +-
 fs/ubifs/dir.c              | 2 +-
 fs/udf/namei.c              | 2 +-
 fs/ufs/namei.c              | 2 +-
 fs/xfs/linux-2.6/xfs_iops.c | 2 +-
 fs/xfs/xfs_inode.h          | 2 +-
 include/linux/fs.h          | 1 +
 ipc/mqueue.c                | 2 +-
 kernel/futex.c              | 2 +-
 mm/shmem.c                  | 2 +-
 net/socket.c                | 2 +-
 40 files changed, 57 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..ef5905f7c8a3 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1789,9 +1789,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		kfree(st);
 	} else {
 		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just i_count++
+		 * This dentry will be released immediately. So, just hold the
+		 * inode
 		 */
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 	}
 
 	dentry->d_op = old_dentry->d_op;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
 		mark_buffer_dirty_inode(inode_bh, inode);
 		inode->i_nlink = 2;
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	}
 	affs_fix_checksum(sb, bh);
 	mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	if (ret < 0)
 		goto link_error;
 
-	atomic_inc(&vnode->vfs_inode.i_count);
+	ihold(&vnode->vfs_inode);
 	d_instantiate(dentry, &vnode->vfs_inode);
 	key_put(key);
 	_leave(" = 0");
diff --git a/fs/aio.c b/fs/aio.c
index 9e319a04780e..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1553,10 +1553,9 @@ static void aio_batch_add(struct address_space *mapping,
 	 *
 	 * When we're called, we always have a reference
 	 * on the file, so we must always have a reference
-	 * on the inode, so igrab must always just
-	 * bump the count and move on.
+	 * on the inode, so ihold() is safe here.
 	 */
-	atomic_inc(&mapping->host->i_count);
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..9c8e87b0361f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -111,10 +111,9 @@ struct file *anon_inode_getfile(const char *name,
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
 	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(new, inode);
 	mutex_unlock(&info->bfs_lock);
 	return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b737451e2e9d..81972eb34b39 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -550,7 +550,7 @@ EXPORT_SYMBOL(bdget);
  */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-	atomic_inc(&bdev->bd_inode->i_count);
+	ihold(bdev->bd_inode);
 	return bdev;
 }
 
@@ -580,7 +580,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
 	if (bdev) {
-		atomic_inc(&bdev->bd_inode->i_count);
+		ihold(bdev->bd_inode);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
@@ -591,12 +591,12 @@ static struct block_device *bd_acquire(struct inode *inode)
 		spin_lock(&bdev_lock);
 		if (!inode->i_bdev) {
 			/*
-			 * We take an additional bd_inode->i_count for inode,
+			 * We take an additional reference to bd_inode,
 			 * and it's released in clear_inode() of inode.
 			 * So, we can access it via ->i_mapping always
 			 * without igrab().
 			 */
-			atomic_inc(&bdev->bd_inode->i_count);
+			ihold(bdev->bd_inode);
 			inode->i_bdev = bdev;
 			inode->i_mapping = bdev->bd_inode->i_mapping;
 			list_add(&inode->i_devices, &bdev->bd_inodes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f6f2a0da2695..64f99cf69ce0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	btrfs_set_trans_block_group(trans, dir);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 96fbeab77f2f..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -276,7 +276,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	}
 
 	coda_dir_update_mtime(dir_inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(de, inode);
 	inc_nlink(inode);
 	return 0;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
 
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	return exofs_add_nondir(dentry, inode);
 }
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext2_add_link(dentry, inode);
 	if (!err) {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext3_add_entry(handle, dentry, inode);
 	if (!err) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..bd39885b5998 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2312,7 +2312,7 @@ retry:
 
 	inode->i_ctime = ext4_current_time(inode);
 	ext4_inc_count(handle, inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 48a274f1674c..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@ out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
 	if (!error) {
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		d_instantiate(dentry, inode);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..e318bbc0daf6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
 	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 	sbi->file_count++;
diff --git a/fs/inode.c b/fs/inode.c
index 430d70f2abe7..05ea293d5f32 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -320,6 +320,15 @@ void __iget(struct inode *inode)
 	atomic_inc(&inode->i_count);
 }
 
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
 static void inode_lru_list_add(struct inode *inode)
 {
 	if (list_empty(&inode->i_list)) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..79121aa5858b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 		mutex_unlock(&f->sem);
 		d_instantiate(dentry, old_dentry->d_inode);
 		dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 	}
 	return ret;
 }
@@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
 		/* Might as well let the VFS know */
 		d_instantiate(new_dentry, old_dentry->d_inode);
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 		new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
 		return ret;
 	}
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
 	 * lazy commit thread finishes processing
 	 */
 	if (tblk->xflag & COMMIT_DELETE) {
-		atomic_inc(&tblk->u.ip->i_count);
+		ihold(tblk->u.ip);
 		/*
 		 * Avoid a rare deadlock
 		 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
 	ip->i_ctime = CURRENT_TIME;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dir);
-	atomic_inc(&ip->i_count);
+	ihold(ip);
 
 	iplist[0] = ip;
 	iplist[1] = dir;
diff --git a/fs/libfs.c b/fs/libfs.c
index 2dbf4877d7ef..304a5132ca27 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -255,7 +255,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	dget(dentry);
 	d_instantiate(dentry, inode);
 	return 0;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 1eb4e89e045b..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -EMLINK;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_nlink++;
 	mark_inode_dirty_sync(inode);
 
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	return add_nondir(dentry, inode);
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index f1ef97dbc6c4..f7dbc06857ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2285,7 +2285,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 			goto slashes;
 		inode = dentry->d_inode;
 		if (inode)
-			atomic_inc(&inode->i_count);
+			ihold(inode);
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..0fac7fea18ef 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1580,7 +1580,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 	d_drop(dentry);
 	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
 	if (error == 0) {
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		d_add(dentry, inode);
 	}
 	return error;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 			iput(inode);
 			return -ENOMEM;
 		}
-		/* Circumvent igrab(): we know the inode is not being freed */
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		/*
 		 * Ensure that this dentry is invisible to d_find_alias().
 		 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 185d1607cb00..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -207,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = nilfs_add_nondir(dentry, inode);
 	if (!err)
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d4dec2d32117..d3fbe5730bfc 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2911,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto unl_upcase_iput_tmp_ino_err_out_now;
 	}
 	if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-		/* We increment i_count simulating an ntfs_iget(). */
-		atomic_inc(&vol->root_ino->i_count);
+		/* We grab a reference, simulating an ntfs_iget(). */
+		ihold(vol->root_ino);
 		ntfs_debug("Exiting, status successful.");
 		/* Release the default upcase if it has no users. */
 		mutex_lock(&ntfs_lock);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21149ae..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_commit;
 	}
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode->i_ctime = CURRENT_TIME_SEC;
 	reiserfs_update_sd(&th, inode);
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 	reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	return add_nondir(dentry, inode);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 
 	lock_2_inodes(dir, inode);
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_ctime = ubifs_current_time(inode);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	unlock_kernel();
 
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	error = ufs_add_nondir(dentry, inode);
 	unlock_kernel();
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 71d83c93621c..96107efc0c61 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -317,7 +317,7 @@ xfs_vn_link(
 	if (unlikely(error))
 		return -error;
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	return 0;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fac52290de90..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
 	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-	atomic_inc(&(VFS_I(ip)->i_count)); \
+	ihold(VFS_I(ip)); \
 	trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d43e8b6685a2..bd6ae6c71fc8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2171,6 +2171,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index e1e7b9635f5d..80b35ffca25d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -769,7 +769,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 
 	inode = dentry->d_inode;
 	if (inode)
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	err = mnt_want_write(ipc_ns->mq_mnt);
 	if (err)
 		goto out_err;
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		atomic_inc(&key->shared.inode->i_count);
+		ihold(key->shared.inode);
 		break;
 	case FUT_OFF_MMSHARED:
 		atomic_inc(&key->private.mm->mm_count);
diff --git a/mm/shmem.c b/mm/shmem.c
index 27a58120dbd5..d4e2852526e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1903,7 +1903,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);	/* New dentry reference */
+	ihold(inode);	/* New dentry reference */
 	dget(dentry);		/* Extra pinning count for the created dentry */
 	d_instantiate(dentry, inode);
 out:
diff --git a/net/socket.c b/net/socket.c
index abf3e2561521..d223725f99e5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -377,7 +377,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
 		  &socket_file_ops);
 	if (unlikely(!file)) {
 		/* drop dentry, keep inode */
-		atomic_inc(&path.dentry->d_inode->i_count);
+		ihold(path.dentry->d_inode);
 		path_put(&path);
 		put_unused_fd(fd);
 		return -ENFILE;
-- 
cgit v1.2.3


From f991bd2e14210fb93d722cb23e54991de20e8a3d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 23 Oct 2010 11:18:01 -0400
Subject: fs: introduce a per-cpu last_ino allocator

new_inode() dirties a contended cache line to get increasing
inode numbers. This limits performance on workloads that cause
significant parallel inode allocation.

Solve this problem by using a per_cpu variable fed by the shared
last_ino in batches of 1024 allocations.  This reduces contention on
the shared last_ino, and give same spreading ino numbers than before
(i.e. same wraparound after 2^32 allocations).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 05ea293d5f32..46a3e120b196 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -717,6 +717,43 @@ repeat:
 	return NULL;
 }
 
+/*
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+ * to renew the exhausted range.
+ *
+ * This does not significantly increase overflow rate because every CPU can
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+ * overflow rate by 2x, which does not seem too significant.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
+ */
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+
+static unsigned int get_next_ino(void)
+{
+	unsigned int *p = &get_cpu_var(last_ino);
+	unsigned int res = *p;
+
+#ifdef CONFIG_SMP
+	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+		static atomic_t shared_last_ino;
+		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+
+		res = next - LAST_INO_BATCH;
+	}
+#endif
+
+	*p = ++res;
+	put_cpu_var(last_ino);
+	return res;
+}
+
 /**
  *	new_inode 	- obtain an inode
  *	@sb: superblock
@@ -731,12 +768,6 @@ repeat:
  */
 struct inode *new_inode(struct super_block *sb)
 {
-	/*
-	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-	 * error if st_ino won't fit in target struct field. Use 32bit counter
-	 * here to attempt to avoid that.
-	 */
-	static unsigned int last_ino;
 	struct inode *inode;
 
 	spin_lock_prefetch(&inode_lock);
@@ -745,7 +776,7 @@ struct inode *new_inode(struct super_block *sb)
 	if (inode) {
 		spin_lock(&inode_lock);
 		__inode_sb_list_add(inode);
-		inode->i_ino = ++last_ino;
+		inode->i_ino = get_next_ino();
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
 	}
-- 
cgit v1.2.3


From 85fe4025c616a7c0ed07bc2fc8c5371b07f3888c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Oct 2010 11:19:54 -0400
Subject: fs: do not assign default i_ino in new_inode

Instead of always assigning an increasing inode number in new_inode
move the call to assign it into those callers that actually need it.
For now callers that need it is estimated conservatively, that is
the call is added to all filesystems that do not assign an i_ino
by themselves.  For a few more filesystems we can avoid assigning
any inode number given that they aren't user visible, and for others
it could be done lazily when an inode number is actually needed,
but that's left for later patches.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/infiniband/hw/ipath/ipath_fs.c | 1 +
 drivers/infiniband/hw/qib/qib_fs.c     | 1 +
 drivers/misc/ibmasm/ibmasmfs.c         | 1 +
 drivers/oprofile/oprofilefs.c          | 1 +
 drivers/usb/core/inode.c               | 1 +
 drivers/usb/gadget/f_fs.c              | 1 +
 drivers/usb/gadget/inode.c             | 1 +
 fs/anon_inodes.c                       | 1 +
 fs/autofs4/inode.c                     | 1 +
 fs/binfmt_misc.c                       | 1 +
 fs/configfs/inode.c                    | 1 +
 fs/debugfs/inode.c                     | 1 +
 fs/ext4/mballoc.c                      | 1 +
 fs/freevxfs/vxfs_inode.c               | 1 +
 fs/fuse/control.c                      | 1 +
 fs/hugetlbfs/inode.c                   | 1 +
 fs/inode.c                             | 4 ++--
 fs/ocfs2/dlmfs/dlmfs.c                 | 2 ++
 fs/pipe.c                              | 2 ++
 fs/proc/base.c                         | 2 ++
 fs/proc/proc_sysctl.c                  | 2 ++
 fs/ramfs/inode.c                       | 1 +
 fs/xfs/linux-2.6/xfs_buf.c             | 1 +
 include/linux/fs.h                     | 1 +
 ipc/mqueue.c                           | 1 +
 kernel/cgroup.c                        | 1 +
 mm/shmem.c                             | 1 +
 net/socket.c                           | 1 +
 net/sunrpc/rpc_pipe.c                  | 1 +
 security/inode.c                       | 1 +
 security/selinux/selinuxfs.c           | 1 +
 31 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index d13e72685dcf..12d5bf76302c 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -57,6 +57,7 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto bail;
 	}
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_private = data;
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index a0e6613e8be6..7e433d75c775 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -58,6 +58,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto bail;
 	}
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index af2497ae5fe3..0a53500636c9 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -146,6 +146,7 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode)
 	struct inode *ret = new_inode(sb);
 
 	if (ret) {
+		ret->i_ino = get_next_ino();
 		ret->i_mode = mode;
 		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
 	}
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 95f711b251ad..449de59bf35b 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -28,6 +28,7 @@ static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode)
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	}
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 095fa5366690..e2f63c0ea09d 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -276,6 +276,7 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index e4f595055208..e093fd8d04d3 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -980,6 +980,7 @@ ffs_sb_make_inode(struct super_block *sb, void *data,
 	if (likely(inode)) {
 		struct timespec current_time = CURRENT_TIME;
 
+		inode->i_ino	 = usbfs_get_inode();
 		inode->i_mode    = perms->mode;
 		inode->i_uid     = perms->uid;
 		inode->i_gid     = perms->gid;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index d1d72d946b04..ba145e7fbe03 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1991,6 +1991,7 @@ gadgetfs_make_inode (struct super_block *sb,
 	struct inode *inode = new_inode (sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = default_uid;
 		inode->i_gid = default_gid;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9c8e87b0361f..5365527ca43f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -193,6 +193,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
 	if (S_ISDIR(inf->mode)) {
 		inode->i_nlink = 2;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 139fc8083f53..29990f0eee0c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..a4ed8380e98a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 19aa0d44d822..42f77b1dc72d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2373,6 +2373,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
 		goto err_freesgi;
 	}
+	sbi->s_buddy_cache->i_ino = get_next_ino();
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
 	struct inode			*ip = NULL;
 
 	if ((ip = new_inode(sbp))) {
+		ip->i_ino = get_next_ino();
 		vxfs_iinit(ip, vip);
 		ip->i_mapping->a_ops = &vxfs_aops;
 	}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7367e177186f..4eba07661e5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -222,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 	if (!inode)
 		return NULL;
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_uid = fc->user_id;
 	inode->i_gid = fc->group_id;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 113eba3d3c38..8d0607b37266 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -455,6 +455,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 	inode = new_inode(sb);
 	if (inode) {
 		struct hugetlbfs_inode_info *info;
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
diff --git a/fs/inode.c b/fs/inode.c
index 46a3e120b196..2cd2e48f7a20 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -735,7 +735,7 @@ repeat:
 #define LAST_INO_BATCH 1024
 static DEFINE_PER_CPU(unsigned int, last_ino);
 
-static unsigned int get_next_ino(void)
+unsigned int get_next_ino(void)
 {
 	unsigned int *p = &get_cpu_var(last_ino);
 	unsigned int res = *p;
@@ -753,6 +753,7 @@ static unsigned int get_next_ino(void)
 	put_cpu_var(last_ino);
 	return res;
 }
+EXPORT_SYMBOL(get_next_ino);
 
 /**
  *	new_inode 	- obtain an inode
@@ -776,7 +777,6 @@ struct inode *new_inode(struct super_block *sb)
 	if (inode) {
 		spin_lock(&inode_lock);
 		__inode_sb_list_add(inode);
-		inode->i_ino = get_next_ino();
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
 	}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index a7ebd9d42dc8..75e115f1bd73 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		ip = DLMFS_I(inode);
 
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	if (!inode)
 		return NULL;
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
diff --git a/fs/pipe.c b/fs/pipe.c
index 37eb1ebeaa90..d2d7566ce68e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
 	if (!inode)
 		goto fail_inode;
 
+	inode->i_ino = get_next_ino();
+
 	pipe = alloc_pipe_info(inode);
 	if (!pipe)
 		goto fail_iput;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fb2a5abd4e4f..9883f1e18332 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1603,6 +1603,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 
 	/* Common stuff */
 	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &proc_def_inode_operations;
 
@@ -2549,6 +2550,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 
 	/* Initialize the inode */
 	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
 	/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2fc52552271d..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	if (!inode)
 		goto out;
 
+	inode->i_ino = get_next_ino();
+
 	sysctl_head_get(head);
 	ei = PROC_I(inode);
 	ei->sysctl = head;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..67fadb1ad2c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba5312802aa9..63fd2c07cb57 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1580,6 +1580,7 @@ xfs_mapping_buftarg(
 			XFS_BUFTARG_NAME(btp));
 		return ENOMEM;
 	}
+	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFBLK;
 	inode->i_bdev = bdev;
 	inode->i_rdev = bdev->bd_dev;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bd6ae6c71fc8..4a573cf13f51 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2191,6 +2191,7 @@ extern struct inode * iget_locked(struct super_block *, unsigned long);
 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
 extern int insert_inode_locked(struct inode *);
 extern void unlock_new_inode(struct inode *);
+extern unsigned int get_next_ino(void);
 
 extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 80b35ffca25d..3a61ffefe884 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -116,6 +116,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 
 	inode = new_inode(sb);
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7b69b8d0313d..9270d532ec3c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e2852526e6..f6d350e8adc5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 	inode = new_inode(sb);
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
diff --git a/net/socket.c b/net/socket.c
index d223725f99e5..5cac1c707755 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -480,6 +480,7 @@ static struct socket *sock_alloc(void)
 	sock = SOCKET_I(inode);
 
 	kmemcheck_annotate_bitfield(sock, type);
+	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 52f252432144..7df92d237cb8 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -445,6 +445,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
 	struct inode *inode = new_inode(sb);
 	if (!inode)
 		return NULL;
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	switch(mode & S_IFMT) {
diff --git a/security/inode.c b/security/inode.c
index 88839866cbcd..cb8f47c66a58 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -61,6 +61,7 @@ static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev)
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 87e0556bae70..55a755c1a1bd 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -978,6 +978,7 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode)
 	struct inode *ret = new_inode(sb);
 
 	if (ret) {
+		ret->i_ino = get_next_ino();
 		ret->i_mode = mode;
 		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
 	}
-- 
cgit v1.2.3


From be148247cfbe2422f5709e77d9c3e10b8a6394da Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:21 -0400
Subject: fs: take dcache_lock inside __d_path

All callers take dcache_lock just around the call to __d_path, so
take the lock into it in preparation of getting rid of dcache_lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c                | 6 ++++--
 fs/seq_file.c              | 2 --
 security/apparmor/path.c   | 2 --
 security/tomoyo/realpath.c | 2 --
 4 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..54f93f5e6b0f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1994,7 +1994,7 @@ global_root:
  * Returns a pointer into the buffer or an error code if the
  * path was too long.
  *
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
  *
  * If path is not reachable from the supplied root, then the value of
  * root is changed (without modifying refcounts).
@@ -2006,10 +2006,12 @@ char *__d_path(const struct path *path, struct path *root,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
+	spin_lock(&dcache_lock);
 	error = prepend_path(path, root, &res, &buflen);
+	spin_unlock(&dcache_lock);
+
 	if (error)
 		return ERR_PTR(error);
-
 	return res;
 }
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 0e7cb1395a94..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 	if (size) {
 		char *p;
 
-		spin_lock(&dcache_lock);
 		p = __d_path(path, root, buf, size);
-		spin_unlock(&dcache_lock);
 		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
 			char *end = mangle_path(buf, p, esc);
diff --git a/security/apparmor/path.c b/security/apparmor/path.c
index 82396050f186..36cc0cc39e78 100644
--- a/security/apparmor/path.c
+++ b/security/apparmor/path.c
@@ -72,10 +72,8 @@ static int d_namespace_path(struct path *path, char *buf, int buflen,
 		path_get(&root);
 	}
 
-	spin_lock(&dcache_lock);
 	tmp = root;
 	res = __d_path(path, &tmp, buf, buflen);
-	spin_unlock(&dcache_lock);
 
 	*name = res;
 	/* handle error conditions - and still allow a partial path to
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index ed8ccd680102..1d0bf8fa1922 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -127,10 +127,8 @@ char *tomoyo_realpath_from_path(struct path *path)
 		/* If we don't have a vfsmount, we can't calculate. */
 		if (!path->mnt)
 			break;
-		spin_lock(&dcache_lock);
 		/* go to whatever namespace root we are under */
 		pos = __d_path(path, &ns_root, buf, buf_len);
-		spin_unlock(&dcache_lock);
 		/* Prepend "/proc" prefix if using internal proc vfs mount. */
 		if (!IS_ERR(pos) && (path->mnt->mnt_flags & MNT_INTERNAL) &&
 		    (path->mnt->mnt_sb->s_magic == PROC_SUPER_MAGIC)) {
-- 
cgit v1.2.3


From 9c82ab9c9e16cb9edf17bd0d31f3d6904afce04f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:22 -0400
Subject: fs: simplify __d_free

Remove d_callback and always call __d_free with a RCU head.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 54f93f5e6b0f..028753951e95 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,20 +67,16 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static void __d_free(struct dentry *dentry)
+static void __d_free(struct rcu_head *head)
 {
+	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+
 	WARN_ON(!list_empty(&dentry->d_alias));
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
-static void d_callback(struct rcu_head *head)
-{
-	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-	__d_free(dentry);
-}
-
 /*
  * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
  * inside dcache_lock.
@@ -91,9 +87,9 @@ static void d_free(struct dentry *dentry)
 		dentry->d_op->d_release(dentry);
 	/* if dentry was never inserted into hash, immediate free is OK */
 	if (hlist_unhashed(&dentry->d_hash))
-		__d_free(dentry);
+		__d_free(&dentry->d_u.d_rcu);
 	else
-		call_rcu(&dentry->d_u.d_rcu, d_callback);
+		call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 
 /*
-- 
cgit v1.2.3


From 312d3ca856d369bb04d0443846b85b4cdde6fa8a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:23 -0400
Subject: fs: use percpu counter for nr_dentry and nr_dentry_unused

The nr_dentry stat is a globally touched cacheline and atomic operation
twice over the lifetime of a dentry. It is used for the benfit of userspace
only. Turn it into a per-cpu counter and always decrement it in d_free instead
of doing various batching operations to reduce lock hold times in the callers.

Based on an earlier patch from Nick Piggin <npiggin@suse.de>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c        | 51 ++++++++++++++++++++++++++++++++-------------------
 include/linux/fs.h |  2 ++
 kernel/sysctl.c    |  2 +-
 3 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 028753951e95..c37a656802b0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,6 +67,19 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+		   size_t *lenp, loff_t *ppos)
+{
+	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
 static void __d_free(struct rcu_head *head)
 {
 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
@@ -78,13 +91,14 @@ static void __d_free(struct rcu_head *head)
 }
 
 /*
- * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
- * inside dcache_lock.
+ * no dcache_lock, please.
  */
 static void d_free(struct dentry *dentry)
 {
+	percpu_counter_dec(&nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
+
 	/* if dentry was never inserted into hash, immediate free is OK */
 	if (hlist_unhashed(&dentry->d_hash))
 		__d_free(&dentry->d_u.d_rcu);
@@ -125,14 +139,14 @@ static void dentry_lru_add(struct dentry *dentry)
 {
 	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	dentry->d_sb->s_nr_dentry_unused++;
-	dentry_stat.nr_unused++;
+	percpu_counter_inc(&nr_dentry_unused);
 }
 
 static void dentry_lru_add_tail(struct dentry *dentry)
 {
 	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	dentry->d_sb->s_nr_dentry_unused++;
-	dentry_stat.nr_unused++;
+	percpu_counter_inc(&nr_dentry_unused);
 }
 
 static void dentry_lru_del(struct dentry *dentry)
@@ -140,7 +154,7 @@ static void dentry_lru_del(struct dentry *dentry)
 	if (!list_empty(&dentry->d_lru)) {
 		list_del(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		dentry_stat.nr_unused--;
+		percpu_counter_dec(&nr_dentry_unused);
 	}
 }
 
@@ -149,7 +163,7 @@ static void dentry_lru_del_init(struct dentry *dentry)
 	if (likely(!list_empty(&dentry->d_lru))) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		dentry_stat.nr_unused--;
+		percpu_counter_dec(&nr_dentry_unused);
 	}
 }
 
@@ -168,7 +182,6 @@ static struct dentry *d_kill(struct dentry *dentry)
 	struct dentry *parent;
 
 	list_del(&dentry->d_u.d_child);
-	dentry_stat.nr_dentry--;	/* For d_free, below */
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
 	if (IS_ROOT(dentry))
@@ -314,7 +327,6 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 
 /* This should be called _only_ with dcache_lock held */
-
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
@@ -534,7 +546,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = dentry_stat.nr_unused;
+	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
 	int prune_ratio;
 	int pruned;
 
@@ -699,20 +711,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			 * otherwise we ascend to the parent and move to the
 			 * next sibling if there is one */
 			if (!parent)
-				goto out;
-
+				return;
 			dentry = parent;
-
 		} while (list_empty(&dentry->d_subdirs));
 
 		dentry = list_entry(dentry->d_subdirs.next,
 				    struct dentry, d_u.d_child);
 	}
-out:
-	/* several dentries were freed, need to correct nr_dentry */
-	spin_lock(&dcache_lock);
-	dentry_stat.nr_dentry -= detached;
-	spin_unlock(&dcache_lock);
 }
 
 /*
@@ -896,12 +901,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+	int nr_unused;
+
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 		prune_dcache(nr);
 	}
-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+
+	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -968,9 +977,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	spin_lock(&dcache_lock);
 	if (parent)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
 
+	percpu_counter_inc(&nr_dentry);
+
 	return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
@@ -2417,6 +2427,9 @@ static void __init dcache_init(void)
 {
 	int loop;
 
+	percpu_counter_init(&nr_dentry, 0);
+	percpu_counter_init(&nr_dentry_unused, 0);
+
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a573cf13f51..d58059944801 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2490,6 +2490,8 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
 struct ctl_table;
 int proc_nr_files(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
+int proc_nr_dentry(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos);
 int proc_nr_inodes(struct ctl_table *table, int write,
 		   void __user *buffer, size_t *lenp, loff_t *ppos);
 int __init get_filesystem_list(char *buf);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99a510cbfbb3..8b77ff5c502c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1377,7 +1377,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &dentry_stat,
 		.maxlen		= 6*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_nr_dentry,
 	},
 	{
 		.procname	= "overflowuid",
-- 
cgit v1.2.3


From 265ac90230257e9c035e4b0c63a0c11c5336e93c Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sun, 10 Oct 2010 05:36:24 -0400
Subject: fs: improve DCACHE_REFERENCED usage

dentry referenced bit is only set when installing the dentry back
onto the LRU. However with lazy LRU, the dentry can already be on
the LRU list at dput time, thus missing out on setting the referenced
bit. Fix this.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c37a656802b0..1a976d4efbe1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,13 +246,16 @@ repeat:
 		if (dentry->d_op->d_delete(dentry))
 			goto unhash_it;
 	}
+
 	/* Unreachable? Get rid of it */
  	if (d_unhashed(dentry))
 		goto kill_it;
-  	if (list_empty(&dentry->d_lru)) {
-  		dentry->d_flags |= DCACHE_REFERENCED;
+
+	/* Otherwise leave it cached and ensure it's on the LRU */
+	dentry->d_flags |= DCACHE_REFERENCED;
+	if (list_empty(&dentry->d_lru))
 		dentry_lru_add(dentry);
-  	}
+
  	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
-- 
cgit v1.2.3


From 3049cfe24ef3872ba74f90630356722cf988b80d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:25 -0400
Subject: fs: split __shrink_dcache_sb

Currently __shrink_dcache_sb has an extremly awkward calling convention
because it tries to please very different callers.  Split out the
main loop into a shrink_dentry_list helper, which gets called directly
from shrink_dcache_sb for the cases where all dentries need to be pruned,
or from __shrink_dcache_sb for pruning only a certain number of dentries.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 127 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 67 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 1a976d4efbe1..e987ad576a39 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -459,66 +459,20 @@ static void prune_one_dentry(struct dentry * dentry)
 	}
 }
 
-/*
- * Shrink the dentry LRU on a given superblock.
- * @sb   : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+static void shrink_dentry_list(struct list_head *list)
 {
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
 	struct dentry *dentry;
-	int cnt = 0;
 
-	BUG_ON(!sb);
-	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
-	spin_lock(&dcache_lock);
-	if (count != NULL)
-		/* called from prune_dcache() and shrink_dcache_parent() */
-		cnt = *count;
-restart:
-	if (count == NULL)
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-	else {
-		while (!list_empty(&sb->s_dentry_lru)) {
-			dentry = list_entry(sb->s_dentry_lru.prev,
-					struct dentry, d_lru);
-			BUG_ON(dentry->d_sb != sb);
-
-			spin_lock(&dentry->d_lock);
-			/*
-			 * If we are honouring the DCACHE_REFERENCED flag and
-			 * the dentry has this flag set, don't free it. Clear
-			 * the flag and put it back on the LRU.
-			 */
-			if ((flags & DCACHE_REFERENCED)
-				&& (dentry->d_flags & DCACHE_REFERENCED)) {
-				dentry->d_flags &= ~DCACHE_REFERENCED;
-				list_move(&dentry->d_lru, &referenced);
-				spin_unlock(&dentry->d_lock);
-			} else {
-				list_move_tail(&dentry->d_lru, &tmp);
-				spin_unlock(&dentry->d_lock);
-				cnt--;
-				if (!cnt)
-					break;
-			}
-			cond_resched_lock(&dcache_lock);
-		}
-	}
-	while (!list_empty(&tmp)) {
-		dentry = list_entry(tmp.prev, struct dentry, d_lru);
+	while (!list_empty(list)) {
+		dentry = list_entry(list->prev, struct dentry, d_lru);
 		dentry_lru_del_init(dentry);
-		spin_lock(&dentry->d_lock);
+
 		/*
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup.  Do not free
 		 * it - just keep it off the LRU list.
 		 */
+		spin_lock(&dentry->d_lock);
 		if (atomic_read(&dentry->d_count)) {
 			spin_unlock(&dentry->d_lock);
 			continue;
@@ -527,13 +481,60 @@ restart:
 		/* dentry->d_lock was dropped in prune_one_dentry() */
 		cond_resched_lock(&dcache_lock);
 	}
-	if (count == NULL && !list_empty(&sb->s_dentry_lru))
-		goto restart;
-	if (count != NULL)
-		*count = cnt;
+}
+
+/**
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb:		superblock to shrink dentry LRU.
+ * @count:	number of entries to prune
+ * @flags:	flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+	/* called from prune_dcache() and shrink_dcache_parent() */
+	struct dentry *dentry;
+	LIST_HEAD(referenced);
+	LIST_HEAD(tmp);
+	int cnt = *count;
+
+	spin_lock(&dcache_lock);
+	while (!list_empty(&sb->s_dentry_lru)) {
+		dentry = list_entry(sb->s_dentry_lru.prev,
+				struct dentry, d_lru);
+		BUG_ON(dentry->d_sb != sb);
+
+		/*
+		 * If we are honouring the DCACHE_REFERENCED flag and the
+		 * dentry has this flag set, don't free it.  Clear the flag
+		 * and put it back on the LRU.
+		 */
+		if (flags & DCACHE_REFERENCED) {
+			spin_lock(&dentry->d_lock);
+			if (dentry->d_flags & DCACHE_REFERENCED) {
+				dentry->d_flags &= ~DCACHE_REFERENCED;
+				list_move(&dentry->d_lru, &referenced);
+				spin_unlock(&dentry->d_lock);
+				cond_resched_lock(&dcache_lock);
+				continue;
+			}
+			spin_unlock(&dentry->d_lock);
+		}
+
+		list_move_tail(&dentry->d_lru, &tmp);
+		if (!--cnt)
+			break;
+		cond_resched_lock(&dcache_lock);
+	}
+
+	*count = cnt;
+	shrink_dentry_list(&tmp);
+
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lock);
+
 }
 
 /**
@@ -619,13 +620,19 @@ static void prune_dcache(int count)
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
  *
- * Shrink the dcache for the specified super block. This
- * is used to free the dcache before unmounting a file
- * system
+ * Shrink the dcache for the specified super block. This is used to free
+ * the dcache before unmounting a file system.
  */
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
 {
-	__shrink_dcache_sb(sb, NULL, 0);
+	LIST_HEAD(tmp);
+
+	spin_lock(&dcache_lock);
+	while (!list_empty(&sb->s_dentry_lru)) {
+		list_splice_init(&sb->s_dentry_lru, &tmp);
+		shrink_dentry_list(&tmp);
+	}
+	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-- 
cgit v1.2.3


From a4633357ac610cd2f8740e28a31fc148a7960421 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:26 -0400
Subject: fs: clean up dentry lru modification

Always do a list_del_init on the LRU to make sure the list_empty invariant for
not beeing on the LRU always holds true, and fold dentry_lru_del_init into
dentry_lru_del.  Replace the dentry_lru_add_tail primitive with a
dentry_lru_move_tail operations that simpler when the dentry already is one
the list, which is always is.  Move the list_empty into dentry_lru_add to
fit the scheme of the other lru helpers, and simplify locking once we
move to a separate LRU lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index e987ad576a39..cc2b93802179 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -133,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
  */
 static void dentry_lru_add(struct dentry *dentry)
 {
-	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-	dentry->d_sb->s_nr_dentry_unused++;
-	percpu_counter_inc(&nr_dentry_unused);
-}
-
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
-	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-	dentry->d_sb->s_nr_dentry_unused++;
-	percpu_counter_inc(&nr_dentry_unused);
+	if (list_empty(&dentry->d_lru)) {
+		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		dentry->d_sb->s_nr_dentry_unused++;
+		percpu_counter_inc(&nr_dentry_unused);
+	}
 }
 
 static void dentry_lru_del(struct dentry *dentry)
 {
 	if (!list_empty(&dentry->d_lru)) {
-		list_del(&dentry->d_lru);
+		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
 		percpu_counter_dec(&nr_dentry_unused);
 	}
 }
 
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
 {
-	if (likely(!list_empty(&dentry->d_lru))) {
-		list_del_init(&dentry->d_lru);
-		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+	if (list_empty(&dentry->d_lru)) {
+		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		dentry->d_sb->s_nr_dentry_unused++;
+		percpu_counter_inc(&nr_dentry_unused);
+	} else {
+		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
 }
 
@@ -253,8 +250,7 @@ repeat:
 
 	/* Otherwise leave it cached and ensure it's on the LRU */
 	dentry->d_flags |= DCACHE_REFERENCED;
-	if (list_empty(&dentry->d_lru))
-		dentry_lru_add(dentry);
+	dentry_lru_add(dentry);
 
  	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
@@ -333,7 +329,7 @@ EXPORT_SYMBOL(d_invalidate);
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
-	dentry_lru_del_init(dentry);
+	dentry_lru_del(dentry);
 	return dentry;
 }
 
@@ -452,7 +448,7 @@ static void prune_one_dentry(struct dentry * dentry)
 
 		if (dentry->d_op && dentry->d_op->d_delete)
 			dentry->d_op->d_delete(dentry);
-		dentry_lru_del_init(dentry);
+		dentry_lru_del(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
 		spin_lock(&dcache_lock);
@@ -465,7 +461,7 @@ static void shrink_dentry_list(struct list_head *list)
 
 	while (!list_empty(list)) {
 		dentry = list_entry(list->prev, struct dentry, d_lru);
-		dentry_lru_del_init(dentry);
+		dentry_lru_del(dentry);
 
 		/*
 		 * We found an inuse dentry which was not removed from
@@ -650,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 	/* detach this root from the system */
 	spin_lock(&dcache_lock);
-	dentry_lru_del_init(dentry);
+	dentry_lru_del(dentry);
 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
@@ -664,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			spin_lock(&dcache_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
-				dentry_lru_del_init(loop);
+				dentry_lru_del(loop);
 				__d_drop(loop);
 				cond_resched_lock(&dcache_lock);
 			}
@@ -841,14 +837,15 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
-		dentry_lru_del_init(dentry);
 		/* 
 		 * move only zero ref count dentries to the end 
 		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
-			dentry_lru_add_tail(dentry);
+			dentry_lru_move_tail(dentry);
 			found++;
+		} else {
+			dentry_lru_del(dentry);
 		}
 
 		/*
-- 
cgit v1.2.3


From 3825bdb7ed920845961f32f364454bee5f469abb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:27 -0400
Subject: fs: use RCU read side protection in d_validate

d_validate does a purely read lookup in the dentry hash, so use RCU read side
locking instead of dcache_lock.  Split out from a larget patch by
Nick Piggin <npiggin@suse.de>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index cc2b93802179..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1491,33 +1491,26 @@ out:
  * This is used by ncpfs in its readdir implementation.
  * Zero is returned in the dentry is invalid.
  */
- 
-int d_validate(struct dentry *dentry, struct dentry *dparent)
+int d_validate(struct dentry *dentry, struct dentry *parent)
 {
-	struct hlist_head *base;
-	struct hlist_node *lhp;
+	struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
+	struct hlist_node *node;
+	struct dentry *d;
 
 	/* Check whether the ptr might be valid at all.. */
 	if (!kmem_ptr_validate(dentry_cache, dentry))
-		goto out;
-
-	if (dentry->d_parent != dparent)
-		goto out;
+		return 0;
+	if (dentry->d_parent != parent)
+		return 0;
 
-	spin_lock(&dcache_lock);
-	base = d_hash(dparent, dentry->d_name.hash);
-	hlist_for_each(lhp,base) { 
-		/* hlist_for_each_entry_rcu() not required for d_hash list
-		 * as it is parsed under dcache_lock
-		 */
-		if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
-			spin_unlock(&dcache_lock);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, node, head, d_hash) {
+		if (d == dentry) {
+			dget(dentry);
 			return 1;
 		}
 	}
-	spin_unlock(&dcache_lock);
-out:
+	rcu_read_unlock();
 	return 0;
 }
 EXPORT_SYMBOL(d_validate);
-- 
cgit v1.2.3


From 0461ee2616252f1f6cec628990fa913a4282dcf7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Oct 2010 11:56:37 -0400
Subject: exportfs: use dget_parent

Use dget_parent instead of opencoding it.  This simplifies the code, but
more importanly prepares for the more complicated locking for a parent
dget in the dcache scale patch series.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
 find_disconnected_root(struct dentry *dentry)
 {
 	dget(dentry);
-	spin_lock(&dentry->d_lock);
-	while (!IS_ROOT(dentry) &&
-	       (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
-		struct dentry *parent = dentry->d_parent;
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+
+		if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+			dput(parent);
+			break;
+		}
+
 		dput(dentry);
 		dentry = parent;
-		spin_lock(&dentry->d_lock);
 	}
-	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
-
 /*
  * Make sure target_dir is fully connected to the dentry tree.
  *
-- 
cgit v1.2.3


From be9eee2e8b87e335531a3ae13abb8d26e834c438 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:29 -0400
Subject: smbfs: use dget_parent

Use dget_parent instead of opencoding it.  This simplifies the code, but
more importanly prepares for the more complicated locking for a parent
dget in the dcache scale patch series.

Note that the d_time assignment in smb_renew_times moves out of d_lock,
but it's a single atomic 32-bit value, and that's what other sites
setting it do already.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/smbfs/dir.c  | 16 +++++-----------
 fs/smbfs/proc.c | 10 +++-------
 2 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 00a70cab1f36..f678d421e541 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -406,21 +406,15 @@ void
 smb_renew_times(struct dentry * dentry)
 {
 	dget(dentry);
-	spin_lock(&dentry->d_lock);
-	for (;;) {
-		struct dentry *parent;
+	dentry->d_time = jiffies;
 
-		dentry->d_time = jiffies;
-		if (IS_ROOT(dentry))
-			break;
-		parent = dentry->d_parent;
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
 		dput(dentry);
 		dentry = parent;
-		spin_lock(&dentry->d_lock);
+
+		dentry->d_time = jiffies;
 	}
-	spin_unlock(&dentry->d_lock);
 	dput(dentry);
 }
 
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 71c29b6670b4..3dcf638d4d3a 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -332,16 +332,15 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
 	 * and store it in reversed order [see reverse_string()]
 	 */
 	dget(entry);
-	spin_lock(&entry->d_lock);
 	while (!IS_ROOT(entry)) {
 		struct dentry *parent;
 
 		if (maxlen < (3<<unicode)) {
-			spin_unlock(&entry->d_lock);
 			dput(entry);
 			return -ENAMETOOLONG;
 		}
 
+		spin_lock(&entry->d_lock);
 		len = server->ops->convert(path, maxlen-2, 
 				      entry->d_name.name, entry->d_name.len,
 				      server->local_nls, server->remote_nls);
@@ -359,15 +358,12 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
 		}
 		*path++ = '\\';
 		maxlen -= len+1;
-
-		parent = entry->d_parent;
-		dget(parent);
 		spin_unlock(&entry->d_lock);
+
+		parent = dget_parent(entry);
 		dput(entry);
 		entry = parent;
-		spin_lock(&entry->d_lock);
 	}
-	spin_unlock(&entry->d_lock);
 	dput(entry);
 	reverse_string(buf, path-buf);
 
-- 
cgit v1.2.3


From 4d4eb36679adbdd75495e1bbfe7ac40e4ae41dea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 10 Oct 2010 05:36:30 -0400
Subject: fsnotify: use dget_parent

Use dget_parent instead of opencoding it.  This simplifies the code, but
more importanly prepares for the more complicated locking for a parent
dget in the dcache scale patch series.

It means we do grab a reference to the parent now if need to be watched,
but not with the specified mask.  If this turns out to be a problem
we'll have to revisit it, but for now let's keep as much as possible
dcache internals inside dcache.[ch].

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/notify/fsnotify.c | 33 +++++----------------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..4498a208df94 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
-	bool send = false;
-	bool should_update_children = false;
 
 	if (!dentry)
 		dentry = path->dentry;
@@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
 
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
+	parent = dget_parent(dentry);
 	p_inode = parent->d_inode;
 
-	if (fsnotify_inode_watches_children(p_inode)) {
-		if (p_inode->i_fsnotify_mask & mask) {
-			dget(parent);
-			send = true;
-		}
-	} else {
-		/*
-		 * The parent doesn't care about events on it's children but
-		 * at least one child thought it did.  We need to run all the
-		 * children and update their d_flags to let them know p_inode
-		 * doesn't care about them any more.
-		 */
-		dget(parent);
-		should_update_children = true;
-	}
-
-	spin_unlock(&dentry->d_lock);
-
-	if (send) {
+	if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+		__fsnotify_update_child_dentry_flags(p_inode);
+	else if (p_inode->i_fsnotify_mask & mask) {
 		/* we are notifying a parent so come up with the new mask which
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
@@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 		else
 			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
 				 dentry->d_name.name, 0);
-		dput(parent);
 	}
 
-	if (unlikely(should_update_children)) {
-		__fsnotify_update_child_dentry_flags(p_inode);
-		dput(parent);
-	}
+	dput(parent);
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
-- 
cgit v1.2.3


From 99a38919241fd051b8d93b2e4d0c05ef0556d795 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Oct 2010 19:07:20 +0200
Subject: fs: fix buffer invalidation in invalidate_list

We must not call invalidate_inode_buffers in invalidate_list unless the
inode can be reclaimed.  If we remove the buffer association of a busy
inode fsync won't find the buffers anymore.  As invalidate_inode_buffers
is called from various others sources than umount this actually does
matter in practice.

While at it change the loop to a more natural form and remove the
WARN_ON for I_NEW, wich we already tested a few lines above.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 2cd2e48f7a20..4bedac32154f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,7 +28,6 @@
 /*
  * This is needed for the following functions:
  *  - inode_has_buffers
- *  - invalidate_inode_buffers
  *  - invalidate_bdev
  *
  * FIXME: remove all knowledge of the buffer layer from this file
@@ -503,16 +502,15 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		inode = list_entry(tmp, struct inode, i_sb_list);
 		if (inode->i_state & I_NEW)
 			continue;
-		invalidate_inode_buffers(inode);
-		if (!atomic_read(&inode->i_count)) {
-			list_move(&inode->i_list, dispose);
-			WARN_ON(inode->i_state & I_NEW);
-			inode->i_state |= I_FREEING;
-			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-				percpu_counter_dec(&nr_inodes_unused);
+		if (atomic_read(&inode->i_count)) {
+			busy = 1;
 			continue;
 		}
-		busy = 1;
+
+		list_move(&inode->i_list, dispose);
+		inode->i_state |= I_FREEING;
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
 	}
 	return busy;
 }
-- 
cgit v1.2.3


From a5491e0c7bb7387e3e6ff9994d6dc2efc78af56c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 21 Oct 2010 11:49:26 +1100
Subject: fs: switch bdev inode bdi's correctly

bdev inodes can remain dirty even after their last close. Hence the
BDI associated with the bdev->inode gets modified duringthe last
close to point to the default BDI. However, the bdev inode still
needs to be moved to the dirty lists of the new BDI, otherwise it
will corrupt the writeback list is was left on.

Add a new function bdev_inode_switch_bdi() to move all the bdi state
from the old bdi to the new one safely. This is only a temporary
measure until the bdev inode<->bdi lifecycle problems are sorted
out.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 81972eb34b39..4e847e53051f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 
 EXPORT_SYMBOL(I_BDEV);
 
+/*
+ * move the inode from it's current bdi to the a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+			struct backing_dev_info *dst)
+{
+	spin_lock(&inode_lock);
+	inode->i_data.backing_dev_info = dst;
+	if (inode->i_state & I_DIRTY)
+		list_move(&inode->i_list, &dst->wb.b_dirty);
+	spin_unlock(&inode_lock);
+}
+
 static sector_t max_block(struct block_device *bdev)
 {
 	sector_t retval = ~((sector_t)0);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				bdi = blk_get_backing_dev_info(bdev);
 				if (bdi == NULL)
 					bdi = &default_backing_dev_info;
-				bdev->bd_inode->i_data.backing_dev_info = bdi;
+				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
 			}
 			if (bdev->bd_invalidated)
 				rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (ret)
 				goto out_clear;
 			bdev->bd_contains = whole;
-			bdev->bd_inode->i_data.backing_dev_info =
-			   whole->bd_inode->i_data.backing_dev_info;
+			bdev_inode_switch_bdi(bdev->bd_inode,
+				whole->bd_inode->i_data.backing_dev_info);
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
 			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
-	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
-- 
cgit v1.2.3


From 7ccf19a8042e343f8159f8a5fdd6a9422aa90c78 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 21 Oct 2010 11:49:30 +1100
Subject: fs: inode split IO and LRU lists

The use of the same inode list structure (inode->i_list) for two
different list constructs with different lifecycles and purposes
makes it impossible to separate the locking of the different
operations. Therefore, to enable the separation of the locking of
the writeback and reclaim lists, split the inode->i_list into two
separate lists dedicated to their specific tracking functions.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     |  2 +-
 fs/fs-writeback.c  | 35 ++++++++++++++++++-----------------
 fs/inode.c         | 53 ++++++++++++++++++++++++++++++++++-------------------
 include/linux/fs.h |  3 ++-
 mm/backing-dev.c   |  6 +++---
 5 files changed, 58 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4e847e53051f..dea3b628a6ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -59,7 +59,7 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 	spin_lock(&inode_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
-		list_move(&inode->i_list, &dst->wb.b_dirty);
+		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode_lock);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e8f65290e836..7a24cc957f05 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 	return sb->s_bdi;
 }
 
+static inline struct inode *wb_inode(struct list_head *head)
+{
+	return list_entry(head, struct inode, i_wb_list);
+}
+
 static void bdi_queue_work(struct backing_dev_info *bdi,
 		struct wb_writeback_work *work)
 {
@@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode)
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		tail = wb_inode(wb->b_dirty.next);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &wb->b_dirty);
+	list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 
 /*
@@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	list_move(&inode->i_list, &wb->b_more_io);
+	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 
 	while (!list_empty(delaying_queue)) {
-		inode = list_entry(delaying_queue->prev, struct inode, i_list);
+		inode = wb_inode(delaying_queue->prev);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		if (sb && sb != inode->i_sb)
 			do_sb_sort = 1;
 		sb = inode->i_sb;
-		list_move(&inode->i_list, &tmp);
+		list_move(&inode->i_wb_list, &tmp);
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 
 	/* Move inodes from one superblock together */
 	while (!list_empty(&tmp)) {
-		inode = list_entry(tmp.prev, struct inode, i_list);
-		sb = inode->i_sb;
+		sb = wb_inode(tmp.prev)->i_sb;
 		list_for_each_prev_safe(pos, node, &tmp) {
-			inode = list_entry(pos, struct inode, i_list);
+			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_list, dispatch_queue);
+				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
 }
@@ -414,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * a reference to the inode or it's on it's way out.
 			 * No need to add it back to the LRU.
 			 */
-			list_del_init(&inode->i_list);
+			list_del_init(&inode->i_wb_list);
 		}
 	}
 	inode_sync_complete(inode);
@@ -462,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
 			if (only_this_sb) {
@@ -533,8 +536,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
 		if (!pin_sb_for_writeback(sb)) {
@@ -672,8 +674,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 */
 		spin_lock(&inode_lock);
 		if (!list_empty(&wb->b_more_io))  {
-			inode = list_entry(wb->b_more_io.prev,
-						struct inode, i_list);
+			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
@@ -987,7 +988,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &bdi->wb.b_dirty);
+			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
diff --git a/fs/inode.c b/fs/inode.c
index 4bedac32154f..09e2d7a5f1d2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -71,7 +71,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-static LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_lru);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -271,6 +271,7 @@ EXPORT_SYMBOL(__destroy_inode);
 
 static void destroy_inode(struct inode *inode)
 {
+	BUG_ON(!list_empty(&inode->i_lru));
 	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
@@ -289,7 +290,8 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
-	INIT_LIST_HEAD(&inode->i_list);
+	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_lru);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -330,16 +332,16 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	if (list_empty(&inode->i_list)) {
-		list_add(&inode->i_list, &inode_unused);
+	if (list_empty(&inode->i_lru)) {
+		list_add(&inode->i_lru, &inode_lru);
 		percpu_counter_inc(&nr_inodes_unused);
 	}
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	if (!list_empty(&inode->i_list)) {
-		list_del_init(&inode->i_list);
+	if (!list_empty(&inode->i_lru)) {
+		list_del_init(&inode->i_lru);
 		percpu_counter_dec(&nr_inodes_unused);
 	}
 }
@@ -460,8 +462,8 @@ static void dispose_list(struct list_head *head)
 	while (!list_empty(head)) {
 		struct inode *inode;
 
-		inode = list_first_entry(head, struct inode, i_list);
-		list_del_init(&inode->i_list);
+		inode = list_first_entry(head, struct inode, i_lru);
+		list_del_init(&inode->i_lru);
 
 		evict(inode);
 
@@ -507,8 +509,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			continue;
 		}
 
-		list_move(&inode->i_list, dispose);
 		inode->i_state |= I_FREEING;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, dispose);
+		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
 			percpu_counter_dec(&nr_inodes_unused);
 	}
@@ -580,10 +588,10 @@ static void prune_icache(int nr_to_scan)
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
-		if (list_empty(&inode_unused))
+		if (list_empty(&inode_lru))
 			break;
 
-		inode = list_entry(inode_unused.prev, struct inode, i_list);
+		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
 		/*
 		 * Referenced or dirty inodes are still in use. Give them
@@ -591,14 +599,14 @@ static void prune_icache(int nr_to_scan)
 		 */
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
-			list_del_init(&inode->i_list);
+			list_del_init(&inode->i_lru);
 			percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
-			list_move(&inode->i_list, &inode_unused);
+			list_move(&inode->i_lru, &inode_lru);
 			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
@@ -611,15 +619,21 @@ static void prune_icache(int nr_to_scan)
 			iput(inode);
 			spin_lock(&inode_lock);
 
-			if (inode != list_entry(inode_unused.next,
-						struct inode, i_list))
+			if (inode != list_entry(inode_lru.next,
+						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
 			if (!can_unuse(inode))
 				continue;
 		}
-		list_move(&inode->i_list, &freeable);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, &freeable);
+		list_del_init(&inode->i_wb_list);
 		percpu_counter_dec(&nr_inodes_unused);
 	}
 	if (current_is_kswapd())
@@ -1340,15 +1354,16 @@ static void iput_final(struct inode *inode)
 		inode->i_state &= ~I_WILL_FREE;
 		__remove_inode_hash(inode);
 	}
+
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
 
 	/*
-	 * After we delete the inode from the LRU here, we avoid moving dirty
-	 * inodes back onto the LRU now because I_FREEING is set and hence
-	 * writeback_single_inode() won't move the inode around.
+	 * Move the inode off the IO lists and LRU once I_FREEING is
+	 * set so that it won't get moved back on there if it is dirty.
 	 */
 	inode_lru_list_del(inode);
+	list_del_init(&inode->i_wb_list);
 
 	__inode_sb_list_del(inode);
 	spin_unlock(&inode_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d58059944801..f300a6508818 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -723,7 +723,8 @@ struct posix_acl;
 
 struct inode {
 	struct hlist_node	i_hash;
-	struct list_head	i_list;		/* backing dev IO list */
+	struct list_head	i_wb_list;	/* backing dev IO list */
+	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
 	struct list_head	i_dentry;
 	unsigned long		i_ino;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..15d5097de821 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 
 	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
 	spin_lock(&inode_lock);
-	list_for_each_entry(inode, &wb->b_dirty, i_list)
+	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
-	list_for_each_entry(inode, &wb->b_io, i_list)
+	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
-	list_for_each_entry(inode, &wb->b_more_io, i_list)
+	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
 	spin_unlock(&inode_lock);
 
-- 
cgit v1.2.3


From d895a1c96af8c2a0f6a5e0119695a7c6b92df8db Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Oct 2010 19:40:24 +0200
Subject: fs: do not drop inode_lock in dispose_list

Despite the comment above it we can not safely drop the lock here.
invalidate_list is called from many other places that just umount.
Also switch to proper list macros now that we never drop the lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 09e2d7a5f1d2..1bf2be41257a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,26 +482,10 @@ static void dispose_list(struct list_head *head)
  */
 static int invalidate_list(struct list_head *head, struct list_head *dispose)
 {
-	struct list_head *next;
+	struct inode *inode, *next;
 	int busy = 0;
 
-	next = head->next;
-	for (;;) {
-		struct list_head *tmp = next;
-		struct inode *inode;
-
-		/*
-		 * We can reschedule here without worrying about the list's
-		 * consistency because the per-sb list of inodes must not
-		 * change during umount anymore, and because iprune_sem keeps
-		 * shrink_icache_memory() away.
-		 */
-		cond_resched_lock(&inode_lock);
-
-		next = next->next;
-		if (tmp == head)
-			break;
-		inode = list_entry(tmp, struct inode, i_sb_list);
+	list_for_each_entry_safe(inode, next, head, i_sb_list) {
 		if (inode->i_state & I_NEW)
 			continue;
 		if (atomic_read(&inode->i_count)) {
-- 
cgit v1.2.3


From a031878670ac8fe466859d4c1506bd91ae48678c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Oct 2010 19:40:33 +0200
Subject: fs: fold invalidate_list into invalidate_inodes

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 43 ++++++++++++++++---------------------------
 1 file changed, 16 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 1bf2be41257a..f6b3b52f1278 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -477,15 +477,24 @@ static void dispose_list(struct list_head *head)
 	}
 }
 
-/*
- * Invalidate all inodes for a device.
+/**
+ * invalidate_inodes	- attempt to free all inodes on a superblock
+ * @sb:		superblock to operate on
+ *
+ * Attempts to free all inodes for a given superblock.  If there were any
+ * busy inodes return a non-zero value, else zero.
  */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+int invalidate_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next;
 	int busy = 0;
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
 
-	list_for_each_entry_safe(inode, next, head, i_sb_list) {
+	down_write(&iprune_sem);
+
+	spin_lock(&inode_lock);
+	fsnotify_unmount_inodes(&sb->s_inodes);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & I_NEW)
 			continue;
 		if (atomic_read(&inode->i_count)) {
@@ -499,34 +508,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		 * Move the inode off the IO lists and LRU once I_FREEING is
 		 * set so that it won't get moved back on there if it is dirty.
 		 */
-		list_move(&inode->i_lru, dispose);
+		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
 			percpu_counter_dec(&nr_inodes_unused);
 	}
-	return busy;
-}
-
-/**
- *	invalidate_inodes	- discard the inodes on a device
- *	@sb: superblock
- *
- *	Discard all of the inodes for a given superblock. If the discard
- *	fails because there are busy inodes then a non zero value is returned.
- *	If the discard is successful all the inodes have been discarded.
- */
-int invalidate_inodes(struct super_block *sb)
-{
-	int busy;
-	LIST_HEAD(throw_away);
-
-	down_write(&iprune_sem);
-	spin_lock(&inode_lock);
-	fsnotify_unmount_inodes(&sb->s_inodes);
-	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
-	dispose_list(&throw_away);
+	dispose_list(&dispose);
 	up_write(&iprune_sem);
 
 	return busy;
-- 
cgit v1.2.3


From 9843b76aae80293f5b5a0e275360627508595ce5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Oct 2010 19:40:46 +0200
Subject: fs: skip I_FREEING inodes in writeback_sb_inodes

Skip I_FREEING inodes just like I_WILL_FREE and I_NEW when walking the
writeback lists.  Currenly this can't happen, but once we move from
inode_lock to more fine grained locking we can have an inode that's
still on the writeback lists but has I_FREEING set, and we absolutely
need to skip it here, just like we do for all other inode list walks.

Based on a patch from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7a24cc957f05..f6af81add459 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			return 0;
 		}
 
-		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+		/*
+		 * Don't bother with new inodes or inodes beeing freed, first
+		 * kind does not need peridic writeout yet, and for the latter
+		 * kind writeout is handled by the freer.
+		 */
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
+
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-- 
cgit v1.2.3


From 63997e98a3be68d7cec806d22bf9b02b2e1daabb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 25 Oct 2010 20:49:35 -0400
Subject: split invalidate_inodes()

Pull removal of fsnotify marks into generic_shutdown_super().
Split umount-time work into a new function - evict_inodes().
Make sure that invalidate_inodes() will be able to cope with
I_FREEING once we change locking in iput().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/internal.h          |  1 +
 fs/notify/inode_mark.c |  2 ++
 fs/super.c             |  8 ++++----
 4 files changed, 51 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index f6b3b52f1278..a6d60682f0fd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -477,6 +477,49 @@ static void dispose_list(struct list_head *head)
 	}
 }
 
+/**
+ * evict_inodes	- evict all evictable inodes for a superblock
+ * @sb:		superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
+ */
+void evict_inodes(struct super_block *sb)
+{
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
+
+	down_write(&iprune_sem);
+
+	spin_lock(&inode_lock);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+		if (atomic_read(&inode->i_count))
+			continue;
+
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			WARN_ON(1);
+			continue;
+		}
+
+		inode->i_state |= I_FREEING;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, &dispose);
+		list_del_init(&inode->i_wb_list);
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
+	}
+	spin_unlock(&inode_lock);
+
+	dispose_list(&dispose);
+	up_write(&iprune_sem);
+}
+
 /**
  * invalidate_inodes	- attempt to free all inodes on a superblock
  * @sb:		superblock to operate on
@@ -493,9 +536,8 @@ int invalidate_inodes(struct super_block *sb)
 	down_write(&iprune_sem);
 
 	spin_lock(&inode_lock);
-	fsnotify_unmount_inodes(&sb->s_inodes);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & I_NEW)
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
 			continue;
 		if (atomic_read(&inode->i_count)) {
 			busy = 1;
diff --git a/fs/internal.h b/fs/internal.h
index 4cc67eb6ed56..ebad3b90752d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,4 +106,5 @@ extern void release_open_intent(struct nameidata *);
  * inode.c
  */
 extern int get_nr_dirty_inodes(void);
+extern int evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..21ed10660b80 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
+	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 		struct inode *need_iput_tmp;
 
@@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		spin_lock(&inode_lock);
 	}
+	spin_unlock(&inode_lock);
 }
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..b9c9869165db 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
 		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/* bad name - it should be evict_inodes() */
-		invalidate_inodes(sb);
+		fsnotify_unmount_inodes(&sb->s_inodes);
+
+		evict_inodes(sb);
 
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		/* Forget any remaining inodes */
-		if (invalidate_inodes(sb)) {
+		if (!list_empty(&sb->s_inodes)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
-- 
cgit v1.2.3


From 43c2e885be25311e6289c7da52e8a03c4453ee03 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 2 Oct 2010 15:19:01 -0400
Subject: nfs4: fix channel attribute sanity-checks

The sanity checks here are incorrect; in the worst case they allow
values that crash the client.

They're also over-reliant on the preprocessor.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 75 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7e14e991ddfa..32c8758c99fd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4842,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 		args->bc_attrs.max_reqs);
 }
 
-static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
 {
-	if (rcvd <= sent)
-		return 0;
-	printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
-		"sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
-	return -EINVAL;
+	struct nfs4_channel_attrs *sent = &args->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+
+	if (rcvd->headerpadsz > sent->headerpadsz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz > sent->max_resp_sz)
+		return -EINVAL;
+	/*
+	 * Our requested max_ops is the minimum we need; we're not
+	 * prepared to break up compounds into smaller pieces than that.
+	 * So, no point even trying to continue if the server won't
+	 * cooperate:
+	 */
+	if (rcvd->max_ops < sent->max_ops)
+		return -EINVAL;
+	if (rcvd->max_reqs == 0)
+		return -EINVAL;
+	return 0;
 }
 
-#define _verify_fore_channel_attr(_name_) \
-	_verify_channel_attr("fore", #_name_, \
-			     args->fc_attrs._name_, \
-			     session->fc_attrs._name_)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+{
+	struct nfs4_channel_attrs *sent = &args->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
 
-#define _verify_back_channel_attr(_name_) \
-	_verify_channel_attr("back", #_name_, \
-			     args->bc_attrs._name_, \
-			     session->bc_attrs._name_)
+	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz < sent->max_resp_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+		return -EINVAL;
+	/* These would render the backchannel useless: */
+	if (rcvd->max_ops  == 0)
+		return -EINVAL;
+	if (rcvd->max_reqs == 0)
+		return -EINVAL;
+	return 0;
+}
 
-/*
- * The server is not allowed to increase the fore channel header pad size,
- * maximum response size, or maximum number of operations.
- *
- * The back channel attributes are only negotiatied down: We send what the
- * (back channel) server insists upon.
- */
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
 				     struct nfs4_session *session)
 {
-	int ret = 0;
-
-	ret |= _verify_fore_channel_attr(headerpadsz);
-	ret |= _verify_fore_channel_attr(max_resp_sz);
-	ret |= _verify_fore_channel_attr(max_ops);
-
-	ret |= _verify_back_channel_attr(headerpadsz);
-	ret |= _verify_back_channel_attr(max_rqst_sz);
-	ret |= _verify_back_channel_attr(max_resp_sz);
-	ret |= _verify_back_channel_attr(max_resp_sz_cached);
-	ret |= _verify_back_channel_attr(max_ops);
-	ret |= _verify_back_channel_attr(max_reqs);
+	int ret;
 
-	return ret;
+	ret = nfs4_verify_fore_channel_attrs(args, session);
+	if (ret)
+		return ret;
+	return nfs4_verify_back_channel_attrs(args, session);
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp)
-- 
cgit v1.2.3


From 382279336f428c80f344edfc30d53797e3e76146 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 26 Oct 2010 12:52:53 -0400
Subject: Btrfs: set trans to null in reserve_metadata_bytes if we commit the
 transaction

btrfs_commit_transaction will free our trans, but because we pass trans to
shrink_delalloc we could possibly have a use after free situation.  So instead
if we commit the transaction, set trans to null and set committed to true so we
don't keep trying to commit a transaction.  This fixes a panic I could reproduce
at will.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 180a50146ddf..e2dfd4ab3b9b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3157,6 +3157,7 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 	int retries = 0;
 	int ret = 0;
 	bool reserved = false;
+	bool committed = false;
 
 again:
 	ret = -ENOSPC;
@@ -3249,17 +3250,19 @@ again:
 		goto out;
 
 	ret = -EAGAIN;
-	if (trans)
+	if (trans || committed)
 		goto out;
 
-
 	ret = -ENOSPC;
 	trans = btrfs_join_transaction(root, 1);
 	if (IS_ERR(trans))
 		goto out;
 	ret = btrfs_commit_transaction(trans, root);
-	if (!ret)
+	if (!ret) {
+		trans = NULL;
+		committed = true;
 		goto again;
+	}
 
 out:
 	if (reserved) {
-- 
cgit v1.2.3


From e9bb7f10d3617304ef94ff7aa8fefbce3078f08b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 26 Oct 2010 12:55:03 -0400
Subject: Btrfs: remove warn_on from use_block_rsv

Because btrfs_dirty_inode does a btrfs_join_transaction, it doesn't actually
reserve space.  It does this so we can try and dirty the inode quickly without
having to deal with the ENOSPC problems.  But if it does get back ENOSPC it
handles it properly.  The problem is use_block_rsv does a WARN_ON whenever this
case happens, even tho btrfs_dirty_inode takes it into account and actually
expects to get -ENOSPC if things are particularly tight.  So instead just remove
the warning.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2dfd4ab3b9b..33785333f293 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5358,11 +5358,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return block_rsv;
 
-	WARN_ON(1);
-	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-		block_rsv->size, block_rsv->reserved,
-		block_rsv->freed[0], block_rsv->freed[1]);
-
 	return ERR_PTR(-ENOSPC);
 }
 
-- 
cgit v1.2.3


From 036a1075978e35811f22be3ff86a70cb8d22cb85 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 26 Oct 2010 13:47:04 -0400
Subject: NFS: Fix a compile issue in nfs_root

Stephen Rothwell reports:

> /home/test/linux-2.6/fs/nfs/nfsroot.c: In function 'nfs_root_debug':
> /home/test/linux-2.6/fs/nfs/nfsroot.c:110:2: error: 'nfs_debug'
> undeclared (first use in this function)
> /home/test/linux-2.6/fs/nfs/nfsroot.c:110:2: note: each undeclared
> identifier is reported only once for each function it appears in
> make[3]: *** [fs/nfs/nfsroot.o] Error 1
> make[2]: *** [fs/nfs] Error 2
> make[1]: *** [fs] Error 2
> make: *** [sub-make] Error 2

Which is caused by commit 306a075362a288683f6346185f97dd0e06df19da
(NFS: Allow NFSROOT debugging messages to be enabled dynamically)

Fix is to disable this code when RPC_DEBUG is disabled.

Reported-by: Zimny Lech <napohybelskurwysynom2010@gmail.com>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 460df3652889..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -101,6 +101,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
 /* server:export path string passed to super.c */
 static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
 
+#ifdef RPC_DEBUG
 /*
  * When the "nfsrootdebug" kernel command line option is specified,
  * enable debugging messages for NFSROOT.
@@ -112,6 +113,7 @@ static int __init nfs_root_debug(char *__unused)
 }
 
 __setup("nfsrootdebug", nfs_root_debug);
+#endif
 
 /*
  *  Parse NFS server and directory information passed on the kernel
-- 
cgit v1.2.3


From 21e733930be6458e0c33482b6783e7c15ba984eb Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 21 Oct 2010 06:42:55 -0500
Subject: NTLM auth and sign - Allocate session key/client response dynamically

Start calculating auth response within a session.  Move/Add pertinet
data structures like session key, server challenge and ntlmv2_hash in
a session structure.  We should do the calculations within a session
before copying session key and response over to server data
structures because a session setup can fail.

Only after a very first smb session succeeds, it copy/make its
session key, session key of smb connection.  This key stays with
the smb connection throughout its life.
sequence_number within server is set to 0x2.

The authentication Message Authentication Key (mak) which consists
of session key followed by client response within structure session_key
is now dynamic.  Every authentication type allocates the key + response
sized memory within its session structure and later either assigns or
frees it once the client response is sent and if session's session key
becomes connetion's session key.

ntlm/ntlmi authentication functions are rearranged.  A function
named setup_ntlm_resp(), similar to setup_ntlmv2_resp(), replaces
function cifs_calculate_session_key().

size of CIFS_SESS_KEY_SIZE is changed to 16, to reflect the byte size
of the key it holds.

Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 113 +++++++++++++++++++++++++++++---------------------
 fs/cifs/cifsglob.h    |   9 +---
 fs/cifs/cifspdu.h     |   7 +++-
 fs/cifs/cifsproto.h   |  11 +++--
 fs/cifs/connect.c     |  18 ++++++--
 fs/cifs/sess.c        | 107 +++++++++++++++++------------------------------
 fs/cifs/transport.c   |   6 +--
 7 files changed, 134 insertions(+), 137 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7ac0056294cf..987b479d55dd 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -43,15 +43,16 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 		       unsigned char *p24);
 
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-				const struct session_key *key, char *signature)
+				struct TCP_Server_Info *server, char *signature)
 {
 	struct	MD5Context context;
 
-	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+	if (cifs_pdu == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
 	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	cifs_MD5_update(&context, server->session_key.response,
+			server->session_key.len);
 	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
 
 	cifs_MD5_final(signature, &context);
@@ -79,8 +80,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calculate_signature(cifs_pdu, &server->session_key,
-				      smb_signature);
+	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -90,16 +90,17 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-				const struct session_key *key, char *signature)
+				struct TCP_Server_Info *server, char *signature)
 {
 	struct  MD5Context context;
 	int i;
 
-	if ((iov == NULL) || (signature == NULL) || (key == NULL))
+	if (iov == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
 	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	cifs_MD5_update(&context, server->session_key.response,
+				server->session_key.len);
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -146,8 +147,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calc_signature2(iov, n_vec, &server->session_key,
-				      smb_signature);
+	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -157,14 +157,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-			  const struct session_key *session_key,
+			  struct TCP_Server_Info *server,
 			  __u32 expected_sequence_number)
 {
 	unsigned int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
 
-	if (cifs_pdu == NULL || session_key == NULL)
+	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
 
 	if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -193,7 +193,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 					cpu_to_le32(expected_sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	rc = cifs_calculate_signature(cifs_pdu, session_key,
+	rc = cifs_calculate_signature(cifs_pdu, server,
 		what_we_think_sig_should_be);
 
 	if (rc)
@@ -209,18 +209,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 
 }
 
-/* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_session_key(struct session_key *key, const char *rn,
-			   const char *password)
+/* first calculate 24 bytes ntlm response and then 16 byte session key */
+int setup_ntlm_response(struct cifsSesInfo *ses)
 {
-	char temp_key[16];
-	if ((key == NULL) || (rn == NULL))
+	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
+	char temp_key[CIFS_SESS_KEY_SIZE];
+
+	if (!ses)
 		return -EINVAL;
 
-	E_md4hash(password, temp_key);
-	mdfour(key->data.ntlm, temp_key, 16);
-	memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE);
-	key->len = 40;
+	ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
+		return -ENOMEM;
+	}
+	ses->auth_key.len = temp_len;
+
+	SMBNTencrypt(ses->password, ses->cryptKey,
+			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+
+	E_md4hash(ses->password, temp_key);
+	mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+
 	return 0;
 }
 
@@ -465,19 +475,13 @@ calc_exit_2:
 }
 
 int
-setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-		      const struct nls_table *nls_cp)
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	int rc;
-	struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
+	int baselen;
+	struct ntlmv2_resp *buf;
 	struct HMACMD5Context context;
 
-	buf->blob_signature = cpu_to_le32(0x00000101);
-	buf->reserved = 0;
-	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
-	buf->reserved2 = 0;
-
 	if (ses->server->secType == RawNTLMSSP) {
 		if (!ses->domainName) {
 			rc = find_domain_name(ses);
@@ -494,22 +498,38 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 		}
 	}
 
+	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+	ses->auth_key.len = baselen + ses->tilen;
+	ses->auth_key.response = kmalloc(ses->auth_key.len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		rc = ENOMEM;
+		cERROR(1, "%s: Can't allocate auth blob", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	buf = (struct ntlmv2_resp *)
+			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	buf->blob_signature = cpu_to_le32(0x00000101);
+	buf->reserved = 0;
+	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
+	buf->reserved2 = 0;
+
+	memcpy(ses->auth_key.response + baselen, ses->tiblob, ses->tilen);
+
 	/* calculate buf->ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
 	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
-	CalcNTLMv2_response(ses, resp_buf);
+	CalcNTLMv2_response(ses);
 
 	/* now calculate the session key for NTLMv2 */
 	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
-	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context);
-
-	memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf,
-	       sizeof(struct ntlmv2_resp));
-	ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp);
+	hmac_md5_update(ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				16, &context);
+	hmac_md5_final(ses->auth_key.response, &context);
 
 	return 0;
 
@@ -521,20 +541,17 @@ setup_ntlmv2_rsp_ret:
 	return rc;
 }
 
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
-			 char *v2_session_response)
+void CalcNTLMv2_response(const struct cifsSesInfo *ses)
 {
+	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
 	struct HMACMD5Context context;
+
 	/* rest of v2 struct already generated */
-	memcpy(v2_session_response + 8, ses->cryptKey, 8);
+	memcpy(ses->auth_key.response + offset, ses->cryptKey, 8);
 	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
 
-	hmac_md5_update(v2_session_response+8,
-			sizeof(struct ntlmv2_resp) - 8, &context);
-
-	if (ses->tilen)
-		hmac_md5_update(ses->tiblob, ses->tilen, &context);
+	hmac_md5_update(ses->auth_key.response + offset,
+			ses->auth_key.len - offset, &context);
 
-	hmac_md5_final(v2_session_response, &context);
-/*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+	hmac_md5_final(ses->auth_key.response + CIFS_SESS_KEY_SIZE, &context);
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 539475a11c45..66f76b2d270b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -99,14 +99,7 @@ enum protocolEnum {
 
 struct session_key {
 	unsigned int len;
-	union {
-		char ntlm[CIFS_SESS_KEY_SIZE + 16];
-		char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
-		struct {
-			char key[16];
-			struct ntlmv2_resp resp;
-		} ntlmv2;
-	} data;
+	char *response;
 };
 
 struct cifs_cred {
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b0f4b5656d4c..a152cd6db99b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -130,10 +130,15 @@
  */
 #define CIFS_CRYPTO_KEY_SIZE (8)
 
+/*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE (24)
+
 /*
  * Size of the session key (crypto key encrypted with the password
  */
-#define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (16)
 
 /*
  * Maximum user name length
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e593c40ba7ba..8c2d0cf1a62f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -362,13 +362,12 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-				 const struct session_key *session_key,
+				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
-extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
-				 const char *pass);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
-			     const struct nls_table *);
+extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
+extern int setup_ntlm_response(struct cifsSesInfo *);
+extern void CalcNTLMv2_response(const struct cifsSesInfo *);
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
 				bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7e73176acb58..c5807d39dced 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -175,6 +175,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	}
 	server->sequence_number = 0;
 	server->session_estab = false;
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
 
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
@@ -1562,6 +1565,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 
 	cifs_fscache_release_client_cookie(server);
 
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
+
 	task = xchg(&server->tsk, NULL);
 	if (task)
 		force_sig(SIGKILL, task);
@@ -3178,10 +3185,11 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	} else {
 		mutex_lock(&ses->server->srv_mutex);
 		if (!server->session_estab) {
-			memcpy(&server->session_key.data,
-				&ses->auth_key.data, ses->auth_key.len);
+			server->session_key.response = ses->auth_key.response;
 			server->session_key.len = ses->auth_key.len;
-			ses->server->session_estab = true;
+			server->sequence_number = 0x2;
+			server->session_estab = true;
+			ses->auth_key.response = NULL;
 		}
 		mutex_unlock(&server->srv_mutex);
 
@@ -3192,6 +3200,10 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 		spin_unlock(&GlobalMid_Lock);
 	}
 
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+	ses->auth_key.len = 0;
+
 	return rc;
 }
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2a11efd96592..b2934683bd08 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
 #include <linux/slab.h>
 #include "cifs_spnego.h"
 
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-			 unsigned char *p24);
-
 /*
  * Checks if this is the first smb session to be reconnected after
  * the socket has been reestablished (so we know whether to use vc 0).
@@ -469,11 +466,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 				   const struct nls_table *nls_cp)
 {
 	int rc;
-	unsigned int size;
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
 	unsigned char *tmp;
-	struct ntlmv2_resp ntlmv2_response = {};
 
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmAuthenticate;
@@ -497,25 +492,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->LmChallengeResponse.MaximumLength = 0;
 
 	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
+	rc = setup_ntlmv2_rsp(ses, nls_cp);
 	if (rc) {
 		cERROR(1, "Error %d during NTLMSSP authentication", rc);
 		goto setup_ntlmv2_ret;
 	}
-	size =  sizeof(struct ntlmv2_resp);
-	memcpy(tmp, (char *)&ntlmv2_response, size);
-	tmp += size;
-	if (ses->tilen > 0) {
-		memcpy(tmp, ses->tiblob, ses->tilen);
-		tmp += ses->tilen;
-	}
+	memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
 
-	sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen);
+	sec_blob->NtChallengeResponse.Length =
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 	sec_blob->NtChallengeResponse.MaximumLength =
-				cpu_to_le16(size + ses->tilen);
-	kfree(ses->tiblob);
-	ses->tiblob = NULL;
-	ses->tilen = 0;
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 
 	if (ses->domainName == NULL) {
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -687,24 +676,27 @@ ssetup_ntlmssp_authenticate:
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
 	} else if (type == NTLM) {
-		char ntlm_session_key[CIFS_SESS_KEY_SIZE];
-
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 		pSMB->req_no_secext.CaseInsensitivePasswordLength =
-			cpu_to_le16(CIFS_SESS_KEY_SIZE);
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(CIFS_SESS_KEY_SIZE);
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+
+		/* calculate ntlm response and session key */
+		rc = setup_ntlm_response(ses);
+		if (rc) {
+			cERROR(1, "Error %d during NTLM authentication", rc);
+			goto ssetup_exit;
+		}
 
-		/* calculate session key */
-		SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key);
+		/* copy ntlm response */
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 
-		cifs_calculate_session_key(&ses->auth_key,
-					ntlm_session_key, ses->password);
-		/* copy session key */
-		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
-		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
 			/* unicode strings must be word aligned */
 			if (iov[0].iov_len % 2) {
@@ -715,47 +707,26 @@ ssetup_ntlmssp_authenticate:
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else if (type == NTLMv2) {
-		char *v2_sess_key =
-			kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
-
-		/* BB FIXME change all users of v2_sess_key to
-		   struct ntlmv2_resp */
-
-		if (v2_sess_key == NULL) {
-			rc = -ENOMEM;
-			goto ssetup_exit;
-		}
-
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 
 		/* LM2 password would be here if we supported it */
 		pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-		/*	cpu_to_le16(LM2_SESS_KEY_SIZE); */
 
-		/* calculate session key */
-		rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+		/* calculate nlmv2 response and session key */
+		rc = setup_ntlmv2_rsp(ses, nls_cp);
 		if (rc) {
 			cERROR(1, "Error %d during NTLMv2 authentication", rc);
-			kfree(v2_sess_key);
 			goto ssetup_exit;
 		}
-		memcpy(bcc_ptr, (char *)v2_sess_key,
-				sizeof(struct ntlmv2_resp));
-		bcc_ptr += sizeof(struct ntlmv2_resp);
-		kfree(v2_sess_key);
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+		bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
 		/* set case sensitive password length after tilen may get
 		 * assigned, tilen is 0 otherwise.
 		 */
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen);
-		if (ses->tilen > 0) {
-			memcpy(bcc_ptr, ses->tiblob, ses->tilen);
-			bcc_ptr += ses->tilen;
-			/* we never did allocate ses->domainName to free */
-			kfree(ses->tiblob);
-			ses->tiblob = NULL;
-			ses->tilen = 0;
-		}
+			cpu_to_le16(ses->auth_key.len);
 
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
@@ -768,6 +739,7 @@ ssetup_ntlmssp_authenticate:
 	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
+
 		spnego_key = cifs_get_spnego_key(ses);
 		if (IS_ERR(spnego_key)) {
 			rc = PTR_ERR(spnego_key);
@@ -785,16 +757,17 @@ ssetup_ntlmssp_authenticate:
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
-		/* bail out if key is too long */
-		if (msg->sesskey_len >
-		    sizeof(ses->auth_key.data.krb5)) {
-			cERROR(1, "Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len);
-			rc = -EOVERFLOW;
+
+		ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
+		if (!ses->auth_key.response) {
+			cERROR(1, "Kerberos can't allocate (%u bytes) memory",
+					msg->sesskey_len);
+			rc = -ENOMEM;
 			goto ssetup_exit;
 		}
+		memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
 		ses->auth_key.len = msg->sesskey_len;
-		memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len);
+
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -897,8 +870,6 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
-
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
 
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a66c91eb6eb4..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(midQ->resp_buf,
-						&ses->server->session_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(out_buf,
-						&ses->server->session_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 				     SECMODE_SIGN_ENABLED))) {
 		rc = cifs_verify_signature(out_buf,
-					   &ses->server->session_key,
+					   ses->server,
 					   midQ->sequence_number+1);
 		if (rc) {
 			cERROR(1, "Unexpected SMB signature");
-- 
cgit v1.2.3


From b235f371a2572d7c86a121d96d889eee02ed00e2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 25 Oct 2010 07:02:56 +0200
Subject: cifs: cifs_convert_address() returns zero on error

The cifs_convert_address() returns zero on error but this caller is
testing for negative returns.

Btw. "i" is unsigned here, so it's never negative.

Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c5807d39dced..dd9a4ae1d21d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1067,7 +1067,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 			i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
 						 value, strlen(value));
-			if (i < 0) {
+			if (i == 0) {
 				printk(KERN_WARNING "CIFS:  Could not parse"
 				       " srcaddr: %s\n",
 				       value);
-- 
cgit v1.2.3


From d2b915210b5ec01409f581421d633eca6c38d444 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 21 Oct 2010 14:25:08 -0500
Subject: NTLM auth and sign - Define crypto hash functions and create and send
 keys needed for key exchange

Mark dependency on crypto modules in Kconfig.

Defining per structures sdesc and cifs_secmech which are used to store
crypto hash functions and contexts.  They are stored per smb connection
and used for all auth mechs to genereate hash values and signatures.

Allocate crypto hashing functions, security descriptiors, and respective
contexts when a smb/tcp connection is established.
Release them when a tcp/smb connection is taken down.

md5 and hmac-md5 are two crypto hashing functions that are used
throught the life of an smb/tcp connection by various functions that
calcualte signagure and ntlmv2 hash, HMAC etc.

structure ntlmssp_auth is defined as per smb connection.

ntlmssp_auth holds ciphertext which is genereated by rc4/arc4 encryption of
secondary key, a nonce using ntlmv2 session key and sent in the session key
field of the type 3 message sent by the client during ntlmssp
negotiation/exchange

A key is exchanged with the server if client indicates so in flags in
type 1 messsage and server agrees in flag in type 2 message of ntlmssp
negotiation.  If both client and agree, a key sent by client in
type 3 message of ntlmssp negotiation in the session key field.
The key is a ciphertext generated off of secondary key, a nonce, using
ntlmv2 hash via rc4/arc4.

Signing works for ntlmssp in this patch. The sequence number within
the server structure needs to be zero until session is established
i.e. till type 3 packet of ntlmssp exchange of a to be very first
smb session on that smb connection is sent.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Kconfig       |   3 ++
 fs/cifs/cifsencrypt.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifsglob.h    |  26 ++++++++++++
 fs/cifs/cifspdu.h     |   6 +++
 fs/cifs/cifsproto.h   |   4 ++
 fs/cifs/connect.c     |  16 +++++--
 fs/cifs/sess.c        |  26 ++++++++----
 7 files changed, 184 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0ed213970ced 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,9 @@ config CIFS
 	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
+	select CRYPTO
+	select CRYPTO_MD5
+	select CRYPTO_ARC4
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 987b479d55dd..eaa2327ee7af 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -541,6 +541,119 @@ setup_ntlmv2_rsp_ret:
 	return rc;
 }
 
+int
+calc_seckey(struct cifsSesInfo *ses)
+{
+	int rc;
+	struct crypto_blkcipher *tfm_arc4;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+	unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+
+	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+
+	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+		cERROR(1, "could not allocate crypto API arc4\n");
+		return PTR_ERR(tfm_arc4);
+	}
+
+	desc.tfm = tfm_arc4;
+
+	crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+					CIFS_SESS_KEY_SIZE);
+
+	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
+	sg_init_one(&sgout, ses->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+	if (rc) {
+		cERROR(1, "could not encrypt session key rc: %d\n", rc);
+		crypto_free_blkcipher(tfm_arc4);
+		return rc;
+	}
+
+	/* make secondary_key/nonce as session key */
+	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+	/* and make len as that of session key only */
+	ses->auth_key.len = CIFS_SESS_KEY_SIZE;
+
+	crypto_free_blkcipher(tfm_arc4);
+
+	return 0;
+}
+
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+	if (server->secmech.md5)
+		crypto_free_shash(server->secmech.md5);
+
+	if (server->secmech.hmacmd5)
+		crypto_free_shash(server->secmech.hmacmd5);
+
+	kfree(server->secmech.sdeschmacmd5);
+
+	kfree(server->secmech.sdescmd5);
+}
+
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+	if (!server->secmech.hmacmd5 ||
+			IS_ERR(server->secmech.hmacmd5)) {
+		cERROR(1, "could not allocate crypto hmacmd5\n");
+		return PTR_ERR(server->secmech.hmacmd5);
+	}
+
+	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+		cERROR(1, "could not allocate crypto md5\n");
+		rc = PTR_ERR(server->secmech.md5);
+		goto crypto_allocate_md5_fail;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.hmacmd5);
+	server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdeschmacmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_hmacmd5_sdesc_fail;
+	}
+	server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+	server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.md5);
+	server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_md5_sdesc_fail;
+	}
+	server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+	server->secmech.sdescmd5->shash.flags = 0x0;
+
+	return 0;
+
+crypto_allocate_md5_sdesc_fail:
+	kfree(server->secmech.sdeschmacmd5);
+
+crypto_allocate_hmacmd5_sdesc_fail:
+	crypto_free_shash(server->secmech.md5);
+
+crypto_allocate_md5_fail:
+	crypto_free_shash(server->secmech.hmacmd5);
+
+	return rc;
+}
+
 void CalcNTLMv2_response(const struct cifsSesInfo *ses)
 {
 	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 66f76b2d270b..7ca5f6d8ed80 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
 #include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
+
 /*
  * The sizes of various internal tables and strings
  */
@@ -102,6 +105,27 @@ struct session_key {
 	char *response;
 };
 
+/* crypto security descriptor definition */
+struct sdesc {
+	struct shash_desc shash;
+	char ctx[];
+};
+
+/* crypto hashing related structure/fields, not speicific to a sec mech */
+struct cifs_secmech {
+	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
+	struct crypto_shash *md5; /* md5 hash function */
+	struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
+	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+};
+
+/* per smb connection structure/fields */
+struct ntlmssp_auth {
+	__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
+	__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
+	unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+};
+
 struct cifs_cred {
 	int uid;
 	int gid;
@@ -178,6 +202,7 @@ struct TCP_Server_Info {
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
+	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
 	/* extended security flavors that server supports */
 	bool	sec_kerberos;		/* supports plain Kerberos */
 	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
@@ -220,6 +245,7 @@ struct cifsSesInfo {
 	char ntlmv2_hash[16];
 	unsigned int tilen; /* length of the target info blob */
 	unsigned char *tiblob; /* target info blob in challenge response */
+	struct ntlmssp_auth ntlmssp; /* ciphertext, flags */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index a152cd6db99b..de36b09763a8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -140,6 +140,12 @@
  */
 #define CIFS_SESS_KEY_SIZE (16)
 
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
+
 /*
  * Maximum user name length
  */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8c2d0cf1a62f..1e4728bcf065 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -368,6 +368,10 @@ extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
 extern int setup_ntlm_response(struct cifsSesInfo *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *);
 extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct cifsSesInfo *);
+
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
 				bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index dd9a4ae1d21d..04239a7ff320 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1563,6 +1563,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
 
+	cifs_crypto_shash_release(server);
 	cifs_fscache_release_client_cookie(server);
 
 	kfree(server->session_key.response);
@@ -1621,10 +1622,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
+	rc = cifs_crypto_shash_allocate(tcp_ses);
+	if (rc) {
+		cERROR(1, "could not setup hash structures rc %d", rc);
+		goto out_err;
+	}
+
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
 		rc = PTR_ERR(tcp_ses->hostname);
-		goto out_err;
+		goto out_err2;
 	}
 
 	tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1668,7 +1675,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 	if (rc < 0) {
 		cERROR(1, "Error connecting to socket. Aborting operation");
-		goto out_err;
+		goto out_err2;
 	}
 
 	/*
@@ -1682,7 +1689,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = PTR_ERR(tcp_ses->tsk);
 		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
-		goto out_err;
+		goto out_err2;
 	}
 
 	/* thread spawned, put it on the list */
@@ -1694,6 +1701,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	return tcp_ses;
 
+out_err2:
+	cifs_crypto_shash_release(tcp_ses);
+
 out_err:
 	if (tcp_ses) {
 		if (!IS_ERR(tcp_ses->hostname))
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index b2934683bd08..d998c4f7aae5 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -404,7 +404,7 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	/* In particular we can examine sign flags */
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
-
+	ses->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
 	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
 	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
 	ses->tilen = tilen;
@@ -440,10 +440,12 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-	if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
-		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+		if (!ses->server->session_estab)
+			flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
+				NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+	}
 
 	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
 
@@ -543,9 +545,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->SessionKey.Length = 0;
-	sec_blob->SessionKey.MaximumLength = 0;
+	if ((ses->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+			!calc_seckey(ses)) {
+		memcpy(tmp, ses->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.MaximumLength =
+				cpu_to_le16(CIFS_CPHTXT_SIZE);
+		tmp += CIFS_CPHTXT_SIZE;
+	} else {
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = 0;
+		sec_blob->SessionKey.MaximumLength = 0;
+	}
 
 setup_ntlmv2_ret:
 	*buflen = tmp - pbuffer;
-- 
cgit v1.2.3


From a178d2027d3198b0a04517d764326ab71cd73da2 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 25 Oct 2010 14:41:59 -0400
Subject: IMA: move read counter into struct inode

IMA currently allocated an inode integrity structure for every inode in
core.  This stucture is about 120 bytes long.  Most files however
(especially on a system which doesn't make use of IMA) will never need
any of this space.  The problem is that if IMA is enabled we need to
know information about the number of readers and the number of writers
for every inode on the box.  At the moment we collect that information
in the per inode iint structure and waste the rest of the space.  This
patch moves those counters into the struct inode so we can eventually
stop allocating an IMA integrity structure except when absolutely
needed.

This patch does the minimum needed to move the location of the data.
Further cleanups, especially the location of counter updates, may still
be possible.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c                        |  1 +
 include/linux/fs.h                |  4 ++++
 security/integrity/ima/ima.h      |  3 +--
 security/integrity/ima/ima_api.c  |  2 +-
 security/integrity/ima/ima_iint.c | 11 +++++------
 security/integrity/ima/ima_main.c | 35 ++++++++++-------------------------
 6 files changed, 22 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..56d909d69bc8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,6 +24,7 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/ima.h>
 
 /*
  * This is needed for the following functions:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b7..01e3a0047fed 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -776,6 +776,10 @@ struct inode {
 
 	unsigned int		i_flags;
 
+#ifdef CONFIG_IMA
+	/* protected by i_lock */
+	unsigned int		i_readcount; /* struct files open RO */
+#endif
 	atomic_t		i_writecount;
 #ifdef CONFIG_SECURITY
 	void			*i_security;
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index b546b90f5fa8..27849e1656dc 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -70,6 +70,7 @@ int ima_init(void);
 void ima_cleanup(void);
 int ima_fs_init(void);
 void ima_fs_cleanup(void);
+int ima_inode_alloc(struct inode *inode);
 int ima_add_template_entry(struct ima_template_entry *entry, int violation,
 			   const char *op, struct inode *inode);
 int ima_calc_hash(struct file *file, char *digest);
@@ -106,8 +107,6 @@ struct ima_iint_cache {
 	unsigned char flags;
 	u8 digest[IMA_DIGEST_SIZE];
 	struct mutex mutex;	/* protects: version, flags, digest */
-	/* protected by inode->i_lock */
-	unsigned int readcount;	/* measured files readcount */
 	struct kref refcount;	/* ima_iint_cache reference count */
 };
 
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 52015d098fdf..d3963de6003d 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -116,7 +116,7 @@ int ima_must_measure(struct ima_iint_cache *iint, struct inode *inode,
 {
 	int must_measure;
 
-	if (iint->flags & IMA_MEASURED)
+	if (iint && iint->flags & IMA_MEASURED)
 		return 1;
 
 	must_measure = ima_match_policy(inode, function, mask);
diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c
index e68891f8d55a..0936a7197e47 100644
--- a/security/integrity/ima/ima_iint.c
+++ b/security/integrity/ima/ima_iint.c
@@ -124,11 +124,6 @@ void iint_free(struct kref *kref)
 						   refcount);
 	iint->version = 0;
 	iint->flags = 0UL;
-	if (iint->readcount != 0) {
-		printk(KERN_INFO "%s: readcount: %u\n", __func__,
-		       iint->readcount);
-		iint->readcount = 0;
-	}
 	kref_init(&iint->refcount);
 	kmem_cache_free(iint_cache, iint);
 }
@@ -143,6 +138,11 @@ void ima_inode_free(struct inode *inode)
 {
 	struct ima_iint_cache *iint;
 
+	if (inode->i_readcount)
+		printk(KERN_INFO "%s: readcount: %u\n", __func__, inode->i_readcount);
+
+	inode->i_readcount = 0;
+
 	spin_lock(&ima_iint_lock);
 	iint = __ima_iint_find(inode);
 	if (iint)
@@ -160,7 +160,6 @@ static void init_once(void *foo)
 	iint->version = 0;
 	iint->flags = 0UL;
 	mutex_init(&iint->mutex);
-	iint->readcount = 0;
 	kref_init(&iint->refcount);
 }
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 24660bf3f82a..2a77b14fee27 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -85,17 +85,6 @@ out:
 	return found;
 }
 
-/*
- * Update the counts given an fmode_t
- */
-static void ima_inc_counts(struct ima_iint_cache *iint, fmode_t mode)
-{
-	assert_spin_locked(&iint->inode->i_lock);
-
-	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
-		iint->readcount++;
-}
-
 /*
  * ima_counts_get - increment file counts
  *
@@ -112,27 +101,23 @@ void ima_counts_get(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	fmode_t mode = file->f_mode;
-	struct ima_iint_cache *iint;
 	int rc;
 	bool send_tomtou = false, send_writers = false;
 
-	if (!iint_initialized || !S_ISREG(inode->i_mode))
+	if (!S_ISREG(inode->i_mode))
 		return;
-	iint = ima_iint_find_get(inode);
-	if (!iint)
-		return;
-	mutex_lock(&iint->mutex);
+
 	spin_lock(&inode->i_lock);
 
 	if (!ima_initialized)
 		goto out;
 
-	rc = ima_must_measure(iint, inode, MAY_READ, FILE_CHECK);
+	rc = ima_must_measure(NULL, inode, MAY_READ, FILE_CHECK);
 	if (rc < 0)
 		goto out;
 
 	if (mode & FMODE_WRITE) {
-		if (iint->readcount)
+		if (inode->i_readcount)
 			send_tomtou = true;
 		goto out;
 	}
@@ -140,10 +125,10 @@ void ima_counts_get(struct file *file)
 	if (atomic_read(&inode->i_writecount) > 0)
 		send_writers = true;
 out:
-	ima_inc_counts(iint, file->f_mode);
+	/* remember the vfs deals with i_writecount */
+	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		inode->i_readcount++;
 	spin_unlock(&inode->i_lock);
-	mutex_unlock(&iint->mutex);
-	kref_put(&iint->refcount, iint_free);
 
 	if (send_tomtou)
 		ima_add_violation(inode, dentry->d_name.name, "invalid_pcr",
@@ -166,9 +151,9 @@ static void ima_dec_counts(struct ima_iint_cache *iint, struct inode *inode,
 	assert_spin_locked(&inode->i_lock);
 
 	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
-		if (unlikely(iint->readcount == 0))
+		if (unlikely(inode->i_readcount == 0))
 			dump = true;
-		iint->readcount--;
+		inode->i_readcount--;
 	}
 	if (mode & FMODE_WRITE) {
 		if (atomic_read(&inode->i_writecount) <= 0)
@@ -180,7 +165,7 @@ static void ima_dec_counts(struct ima_iint_cache *iint, struct inode *inode,
 
 	if (dump && !ima_limit_imbalance(file)) {
 		printk(KERN_INFO "%s: open/free imbalance (r:%u)\n",
-		       __func__, iint->readcount);
+		       __func__, inode->i_readcount);
 		dump_stack();
 	}
 }
-- 
cgit v1.2.3


From 307fbd31b61623ad1b5388b452118f8aea99f9d0 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 21 Oct 2010 14:25:17 -0500
Subject: NTLM auth and sign - Use kernel crypto apis to calculate hashes and
 smb signatures

Use kernel crypto sync hash apis insetead of cifs crypto functions.
The calls typically corrospond one to one except that insead of
key init, setkey is used.

Use crypto apis to generate smb signagtures also.
Use hmac-md5 to genereate ntlmv2 hash, ntlmv2 response, and HMAC (CR1 of
ntlmv2 auth blob.
User crypto apis to genereate signature and to verify signature.
md5 hash is used to calculate signature.
Use secondary key to calculate signature in case of ntlmssp.

For ntlmv2 within ntlmssp, during signature calculation, only 16 bytes key
(a nonce) stored within session key is used. during smb signature calculation.
For ntlm and ntlmv2 without extended security, 16 bytes key
as well as entire response (24 bytes in case of ntlm and variable length
in case of ntlmv2) is used for smb signature calculation.
For kerberos, there is no distinction between key and response.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 195 +++++++++++++++++++++++++++++++++++---------------
 fs/cifs/cifsproto.h   |   1 -
 2 files changed, 136 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index eaa2327ee7af..96908874a45c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -45,17 +45,30 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 				struct TCP_Server_Info *server, char *signature)
 {
-	struct	MD5Context context;
+	int rc;
 
 	if (cifs_pdu == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, server->session_key.response,
-			server->session_key.len);
-	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
@@ -92,15 +105,26 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 				struct TCP_Server_Info *server, char *signature)
 {
-	struct  MD5Context context;
 	int i;
+	int rc;
 
 	if (iov == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, server->session_key.response,
-				server->session_key.len);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -113,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			cifs_MD5_update(&context, iov[0].iov_base+4,
-				  iov[0].iov_len-4);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
 		} else
-			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base, iov[i].iov_len);
 	}
 
-	cifs_MD5_final(signature, &context);
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	return 0;
+	return rc;
 }
 
-
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		   __u32 *pexpected_response_sequence_number)
 {
@@ -420,67 +444,120 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 {
 	int rc = 0;
 	int len;
-	char nt_hash[16];
-	struct HMACMD5Context *pctxt;
+	char nt_hash[CIFS_NTHASH_SIZE];
 	wchar_t *user;
 	wchar_t *domain;
+	wchar_t *server;
 
-	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
-
-	if (pctxt == NULL)
-		return -ENOMEM;
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
 
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	/* convert Domainname to unicode and uppercase */
-	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+	crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+				CIFS_NTHASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+		return rc;
+	}
 
 	/* convert ses->userName to unicode and uppercase */
 	len = strlen(ses->userName);
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
-	if (user == NULL)
+	if (user == NULL) {
+		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+		rc = -ENOMEM;
 		goto calc_exit_2;
+	}
 	len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
 	UniStrupr(user);
-	hmac_md5_update((char *)user, 2*len, pctxt);
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+				(char *)user, 2 * len);
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
 		len = strlen(ses->domainName);
 
 		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-		if (domain == NULL)
+		if (domain == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+			rc = -ENOMEM;
 			goto calc_exit_1;
+		}
 		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
 					nls_cp);
-		/* the following line was removed since it didn't work well
-		   with lower cased domain name that passed as an option.
-		   Maybe converting the domain name earlier makes sense */
-		/* UniStrupr(domain); */
-
-		hmac_md5_update((char *)domain, 2*len, pctxt);
-
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)domain, 2 * len);
 		kfree(domain);
+	} else if (ses->serverName) {
+		len = strlen(ses->serverName);
+
+		server = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (server == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+			rc = -ENOMEM;
+			goto calc_exit_1;
+		}
+		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+					nls_cp);
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)server, 2 * len);
+		kfree(server);
 	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+					ses->ntlmv2_hash);
+
 calc_exit_1:
 	kfree(user);
 calc_exit_2:
-	/* BB FIXME what about bytes 24 through 40 of the signing key?
-	   compare with the NTLM example */
-	hmac_md5_final(ses->ntlmv2_hash, pctxt);
+	return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct cifsSesInfo *ses)
+{
+	int rc;
+	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
+
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+				ses->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+		return rc;
+	}
+
+	memcpy(ses->auth_key.response + offset,
+		ses->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + offset, ses->auth_key.len - offset);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 
-	kfree(pctxt);
 	return rc;
 }
 
+
 int
 setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	int rc;
 	int baselen;
 	struct ntlmv2_resp *buf;
-	struct HMACMD5Context context;
 
 	if (ses->server->secType == RawNTLMSSP) {
 		if (!ses->domainName) {
@@ -523,13 +600,28 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 		cERROR(1, "could not get v2 hash rc %d", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
-	CalcNTLMv2_response(ses);
+	rc = CalcNTLMv2_response(ses);
+	if (rc) {
+		cERROR(1, "Could not calculate CR1  rc: %d", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
 
 	/* now calculate the session key for NTLMv2 */
-	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
-	hmac_md5_update(ses->auth_key.response + CIFS_SESS_KEY_SIZE,
-				16, &context);
-	hmac_md5_final(ses->auth_key.response, &context);
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+		ses->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+		CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response);
 
 	return 0;
 
@@ -653,18 +745,3 @@ crypto_allocate_md5_fail:
 
 	return rc;
 }
-
-void CalcNTLMv2_response(const struct cifsSesInfo *ses)
-{
-	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
-	struct HMACMD5Context context;
-
-	/* rest of v2 struct already generated */
-	memcpy(ses->auth_key.response + offset, ses->cryptKey, 8);
-	hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
-
-	hmac_md5_update(ses->auth_key.response + offset,
-			ses->auth_key.len - offset, &context);
-
-	hmac_md5_final(ses->auth_key.response + CIFS_SESS_KEY_SIZE, &context);
-}
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1e4728bcf065..edb6d90efdf2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -366,7 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *,
 				__u32 expected_sequence_number);
 extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
 extern int setup_ntlm_response(struct cifsSesInfo *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *);
 extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
-- 
cgit v1.2.3


From 1b627d5771312c92404b66f0a0b16f66036dd2e1 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Tue, 26 Oct 2010 14:21:18 -0700
Subject: hostfs: fix UML crash: remove f_spare from hostfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

365b1818 ("add f_flags to struct statfs(64)") resized f_spare within
struct statfs which caused a UML crash.  There is no need to copy f_spare.

Signed-off-by: Richard Weinberger <richard@nod.at>
Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Tested-by: Toralf Förster <toralf.foerster@gmx.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs.h      | 3 +--
 fs/hostfs/hostfs_kern.c | 2 +-
 fs/hostfs/hostfs_user.c | 9 ++-------
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 7c232c1487ee..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
 		     long long *files_out, long long *ffree_out,
-		     void *fsid_out, int fsid_size, long *namelen_out,
-		     long *spare_out);
+		     void *fsid_out, int fsid_size, long *namelen_out);
 
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..cd7c93917cc7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 	err = do_statfs(dentry->d_sb->s_fs_info,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-			&sf->f_namelen, sf->f_spare);
+			&sf->f_namelen);
 	if (err)
 		return err;
 	sf->f_blocks = f_blocks;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..8d02683585e0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -364,8 +364,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	      long long *bfree_out, long long *bavail_out,
 	      long long *files_out, long long *ffree_out,
-	      void *fsid_out, int fsid_size, long *namelen_out,
-	      long *spare_out)
+	      void *fsid_out, int fsid_size, long *namelen_out)
 {
 	struct statfs64 buf;
 	int err;
@@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	       sizeof(buf.f_fsid) > fsid_size ? fsid_size :
 	       sizeof(buf.f_fsid));
 	*namelen_out = buf.f_namelen;
-	spare_out[0] = buf.f_spare[0];
-	spare_out[1] = buf.f_spare[1];
-	spare_out[2] = buf.f_spare[2];
-	spare_out[3] = buf.f_spare[3];
-	spare_out[4] = buf.f_spare[4];
+
 	return 0;
 }
-- 
cgit v1.2.3


From a4f7326da2bfe788b85509ec0c261e64596cdde4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 26 Oct 2010 14:21:21 -0700
Subject: vmcore: it is not experimental any more

We use vmcore in our production kernel for a long time, it is pretty
stable now.  So I don't think we need to mark it as experimental any more.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..6a0068841d96 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
 	depends on PROC_FS && MMU
 
 config PROC_VMCORE
-        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && CRASH_DUMP
+	bool "/proc/vmcore support"
+	depends on PROC_FS && CRASH_DUMP
 	default y
         help
         Exports the dump image of crashed kernel in ELF format.
-- 
cgit v1.2.3


From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001
From: Ying Han <yinghan@google.com>
Date: Tue, 26 Oct 2010 14:21:23 -0700
Subject: oom: add per-mm oom disable count

It's pointless to kill a task if another thread sharing its mm cannot be
killed to allow future memory freeing.  A subsequent patch will prevent
kills in such cases, but first it's necessary to have a way to flag a task
that shares memory with an OOM_DISABLE task that doesn't incur an
additional tasklist scan, which would make select_bad_process() an O(n^2)
function.

This patch adds an atomic counter to struct mm_struct that follows how
many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN.
They cannot be killed by the kernel, so their memory cannot be freed in
oom conditions.

This only requires task_lock() on the task that we're operating on, it
does not require mm->mmap_sem since task_lock() pins the mm and the
operation is atomic.

[rientjes@google.com: changelog and sys_unshare() code]
[rientjes@google.com: protect oom_disable_count with task_lock in fork]
[rientjes@google.com: use old_mm for oom_disable_count in exec]
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                |  5 +++++
 fs/proc/base.c           | 30 ++++++++++++++++++++++++++++++
 include/linux/mm_types.h |  2 ++
 kernel/exit.c            |  3 +++
 kernel/fork.c            | 15 ++++++++++++++-
 5 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..3aa75b8888a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+		atomic_dec(&old_mm->oom_disable_count);
+		atomic_inc(&tsk->mm->oom_disable_count);
+	}
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc5d5f51f3fe..6e50c8e65513 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	task_lock(task);
+	if (!task->mm) {
+		task_unlock(task);
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	if (oom_adjust != task->signal->oom_adj) {
+		if (oom_adjust == OOM_DISABLE)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_adj == OOM_DISABLE)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
+
 	/*
 	 * Warn that /proc/pid/oom_adj is deprecated, see
 	 * Documentation/feature-removal-schedule.txt.
@@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+	task_unlock(task);
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	task_lock(task);
+	if (!task->mm) {
+		task_unlock(task);
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+	if (oom_score_adj != task->signal->oom_score_adj) {
+		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
 	task->signal->oom_score_adj = oom_score_adj;
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 							OOM_SCORE_ADJ_MAX;
+	task_unlock(task);
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 	return count;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cb57d657ce4d..bb7288a782fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -310,6 +310,8 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
+	/* How many tasks sharing this mm are OOM_DISABLE */
+	atomic_t oom_disable_count;
 };
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..894179a32ec1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..e87aaaaf5131 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/oom.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -741,6 +743,8 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_inc(&mm->oom_disable_count);
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -1299,8 +1303,13 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		task_lock(p);
+		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&p->mm->oom_disable_count);
+		task_unlock(p);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 			active_mm = current->active_mm;
 			current->mm = new_mm;
 			current->active_mm = new_mm;
+			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+				atomic_dec(&mm->oom_disable_count);
+				atomic_inc(&new_mm->oom_disable_count);
+			}
 			activate_mm(active_mm, new_mm);
 			new_mm = mm;
 		}
-- 
cgit v1.2.3


From 723548bff1dde9ab6bdb23f4bb92277c4da49473 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 26 Oct 2010 14:21:25 -0700
Subject: oom: rewrite error handling for oom_adj and oom_score_adj tunables

It's better to use proper error handling in oom_adjust_write() and
oom_score_adj_write() instead of duplicating the locking order on various
exit paths.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 83 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6e50c8e65513..34d11ac31f2e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1023,36 +1023,39 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
 		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
 
 	err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
 	if (err)
-		return -EINVAL;
+		goto out;
 	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-	     oom_adjust != OOM_DISABLE)
-		return -EINVAL;
+	     oom_adjust != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task)
-		return -ESRCH;
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
 	if (!lock_task_sighand(task, &flags)) {
-		put_task_struct(task);
-		return -ESRCH;
+		err = -ESRCH;
+		goto err_task_struct;
 	}
 
 	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-		unlock_task_sighand(task, &flags);
-		put_task_struct(task);
-		return -EACCES;
+		err = -EACCES;
+		goto err_sighand;
 	}
 
 	task_lock(task);
 	if (!task->mm) {
-		task_unlock(task);
-		unlock_task_sighand(task, &flags);
-		put_task_struct(task);
-		return -EINVAL;
+		err = -EINVAL;
+		goto err_task_lock;
 	}
 
 	if (oom_adjust != task->signal->oom_adj) {
@@ -1080,11 +1083,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+err_task_lock:
 	task_unlock(task);
+err_sighand:
 	unlock_task_sighand(task, &flags);
+err_task_struct:
 	put_task_struct(task);
-
-	return count;
+out:
+	return err < 0 ? err : count;
 }
 
 static const struct file_operations proc_oom_adjust_operations = {
@@ -1125,36 +1131,39 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
 		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
 
 	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
 	if (err)
-		return -EINVAL;
+		goto out;
 	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
-			oom_score_adj > OOM_SCORE_ADJ_MAX)
-		return -EINVAL;
+			oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task)
-		return -ESRCH;
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
 	if (!lock_task_sighand(task, &flags)) {
-		put_task_struct(task);
-		return -ESRCH;
+		err = -ESRCH;
+		goto err_task_struct;
 	}
 	if (oom_score_adj < task->signal->oom_score_adj &&
 			!capable(CAP_SYS_RESOURCE)) {
-		unlock_task_sighand(task, &flags);
-		put_task_struct(task);
-		return -EACCES;
+		err = -EACCES;
+		goto err_sighand;
 	}
 
 	task_lock(task);
 	if (!task->mm) {
-		task_unlock(task);
-		unlock_task_sighand(task, &flags);
-		put_task_struct(task);
-		return -EINVAL;
+		err = -EINVAL;
+		goto err_task_lock;
 	}
 	if (oom_score_adj != task->signal->oom_score_adj) {
 		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
@@ -1172,10 +1181,14 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 							OOM_SCORE_ADJ_MAX;
+err_task_lock:
 	task_unlock(task);
+err_sighand:
 	unlock_task_sighand(task, &flags);
+err_task_struct:
 	put_task_struct(task);
-	return count;
+out:
+	return err < 0 ? err : count;
 }
 
 static const struct file_operations proc_oom_score_adj_operations = {
-- 
cgit v1.2.3


From d19d5476f4b9f91d2de92b91588bb118beba6c0d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 26 Oct 2010 14:21:26 -0700
Subject: oom: fix locking for oom_adj and oom_score_adj

The locking order in oom_adjust_write() and oom_score_adj_write() for
task->alloc_lock and task->sighand->siglock is reversed, and lockdep
notices that irqs could encounter an ABBA scenario.

This fixes the locking order so that we always take task_lock(task) prior
to lock_task_sighand(task).

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 34d11ac31f2e..53dc8ad40ae6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1042,9 +1042,16 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		err = -ESRCH;
 		goto out;
 	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
 	if (!lock_task_sighand(task, &flags)) {
 		err = -ESRCH;
-		goto err_task_struct;
+		goto err_task_lock;
 	}
 
 	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
@@ -1052,12 +1059,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		goto err_sighand;
 	}
 
-	task_lock(task);
-	if (!task->mm) {
-		err = -EINVAL;
-		goto err_task_lock;
-	}
-
 	if (oom_adjust != task->signal->oom_adj) {
 		if (oom_adjust == OOM_DISABLE)
 			atomic_inc(&task->mm->oom_disable_count);
@@ -1083,11 +1084,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
-err_task_lock:
-	task_unlock(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
-err_task_struct:
+err_task_lock:
+	task_unlock(task);
 	put_task_struct(task);
 out:
 	return err < 0 ? err : count;
@@ -1150,21 +1150,24 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		err = -ESRCH;
 		goto out;
 	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
 	if (!lock_task_sighand(task, &flags)) {
 		err = -ESRCH;
-		goto err_task_struct;
+		goto err_task_lock;
 	}
+
 	if (oom_score_adj < task->signal->oom_score_adj &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}
 
-	task_lock(task);
-	if (!task->mm) {
-		err = -EINVAL;
-		goto err_task_lock;
-	}
 	if (oom_score_adj != task->signal->oom_score_adj) {
 		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
 			atomic_inc(&task->mm->oom_disable_count);
@@ -1181,11 +1184,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 							OOM_SCORE_ADJ_MAX;
-err_task_lock:
-	task_unlock(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
-err_task_struct:
+err_task_lock:
+	task_unlock(task);
 	put_task_struct(task);
 out:
 	return err < 0 ? err : count;
-- 
cgit v1.2.3


From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 26 Oct 2010 14:21:26 -0700
Subject: writeback: remove nonblocking/encountered_congestion references

This removes more dead code that was somehow missed by commit 0d99519efef
(writeback: remove unused nonblocking and congestion checks).  There are
no behavior change except for the removal of two entries from one of the
ext4 tracing interface.

The nonblocking checks in ->writepages are no longer used because the
flusher now prefer to block on get_request_wait() than to skip inodes on
IO congestion.  The latter will lead to more seeky IO.

The nonblocking checks in ->writepage are no longer used because it's
redundant with the WB_SYNC_NONE check.

We no long set ->nonblocking in VM page out and page migration, because
a) it's effectively redundant with WB_SYNC_NONE in current code
b) it's old semantic of "Don't get stuck on request queues" is mis-behavior:
   that would skip some dirty inodes on congestion and page out others, which
   is unfair in terms of LRU age.

Inspired by Christoph Hellwig. Thanks!

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: David Howells <dhowells@redhat.com>
Cc: Sage Weil <sage@newdream.net>
Cc: Steve French <sfrench@samba.org>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/write.c                   | 19 +------------------
 fs/buffer.c                      |  2 +-
 fs/ceph/addr.c                   |  9 ---------
 fs/cifs/file.c                   | 10 ----------
 fs/gfs2/meta_io.c                |  2 +-
 fs/nfs/write.c                   |  4 +---
 fs/reiserfs/inode.c              |  2 +-
 fs/xfs/linux-2.6/xfs_aops.c      |  3 +--
 include/trace/events/ext4.h      |  8 +++++---
 include/trace/events/writeback.h |  2 --
 mm/migrate.c                     |  1 -
 mm/vmscan.c                      |  1 -
 12 files changed, 11 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
diff --git a/fs/buffer.c b/fs/buffer.c
index 7f0b9b083f77..ec21a92e08b4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1706,7 +1706,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		 * and kswapd activity, but those code paths have their own
 		 * higher-level throttling.
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 51bcc5ce3230..e9c874abc9e1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping,
 				 struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc;
 	pgoff_t index, start, end;
@@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping,
 
 	pagevec_init(&pvec, 0);
 
-	/* ?? */
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		dout(" writepages congested\n");
-		wbc->encountered_congestion = 1;
-		goto out_final;
-	}
-
 	/* where to start/end? */
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index; /* Start from prev offset */
@@ -885,7 +877,6 @@ out:
 		rc = 0;  /* vfs expects us to return 0 */
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
-out_final:
 	return rc;
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8c81e7b14d53..45af003865d2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned int bytes_to_write;
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
@@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping,
 	int scanned = 0;
 	int xid, long_op;
 
-	/*
-	 * BB: Is this meaningful for a non-block-device file system?
-	 * If it is, we should test it again after we do I/O
-	 */
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	cifs_sb = CIFS_SB(mapping->host->i_sb);
 
 	/*
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 		 * activity, but those code paths have their own higher-level
 		 * throttling.
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 605e292501f4..4c14c17a5276 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -290,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
 	nfs_pageio_cond_complete(pgio, page->index);
-	ret = nfs_page_async_flush(pgio, page,
-			wbc->sync_mode == WB_SYNC_NONE ||
-			wbc->nonblocking != 0);
+	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
 		ret = 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..c1f93896cb53 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2438,7 +2438,7 @@ static int reiserfs_write_full_page(struct page *page,
 		/* from this point on, we know the buffer is mapped to a
 		 * real block and not a direct item
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else {
 			if (!trylock_buffer(bh)) {
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..c9af48fffcd7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1139,8 +1139,7 @@ xfs_vm_writepage(
 				type = IO_DELAY;
 				flags = BMAPI_ALLOCATE;
 
-				if (wbc->sync_mode == WB_SYNC_NONE &&
-				    wbc->nonblocking)
+				if (wbc->sync_mode == WB_SYNC_NONE)
 					flags |= BMAPI_TRYLOCK;
 			}
 
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 01e9e0076a92..6bcb00645de4 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages,
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
-		__entry->nonblocking	= wbc->nonblocking;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 	),
 
-	TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
+	TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld "
+		  "range_start %llu range_end %llu "
+		  "for_kupdate %d for_reclaim %d "
+		  "range_cyclic %d writeback_index %lu",
 		  jbd2_dev_to_name(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->nr_to_write,
 		  __entry->pages_skipped, __entry->range_start,
-		  __entry->range_end, __entry->nonblocking,
+		  __entry->range_end,
 		  __entry->for_kupdate, __entry->for_reclaim,
 		  __entry->range_cyclic,
 		  (unsigned long) __entry->writeback_index)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index f345f66ae9d1..0bb01ab2e984 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(long, nr_to_write)
 		__field(long, pages_skipped)
 		__field(int, sync_mode)
-		__field(int, nonblocking)
-		__field(int, encountered_congestion)
 		__field(int, for_kupdate)
 		__field(int, for_background)
 		__field(int, for_reclaim)
diff --git a/mm/migrate.c b/mm/migrate.c
index f8c9bccf2520..d917ac3207f5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
 		.nr_to_write = 1,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.nonblocking = 1,
 		.for_reclaim = 1
 	};
 	int rc;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94c9464f262..6cbc1aac23ae 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -376,7 +376,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			.nr_to_write = SWAP_CLUSTER_MAX,
 			.range_start = 0,
 			.range_end = LLONG_MAX,
-			.nonblocking = 1,
 			.for_reclaim = 1,
 		};
 
-- 
cgit v1.2.3


From f629d1c9bd0dbc44a6c4f9a4a67d1646c42bfc6f Mon Sep 17 00:00:00 2001
From: Michael Rubin <mrubin@google.com>
Date: Tue, 26 Oct 2010 14:21:33 -0700
Subject: mm: add account_page_writeback()

To help developers and applications gain visibility into writeback
behaviour this patch adds two counters to /proc/vmstat.

  # grep nr_dirtied /proc/vmstat
  nr_dirtied 3747
  # grep nr_written /proc/vmstat
  nr_written 3618

These entries allow user apps to understand writeback behaviour over time
and learn how it is impacting their performance.  Currently there is no
way to inspect dirty and writeback speed over time.  It's not possible for
nr_dirty/nr_writeback.

These entries are necessary to give visibility into writeback behaviour.
We have /proc/diskstats which lets us understand the io in the block
layer.  We have blktrace for more in depth understanding.  We have
e2fsprogs and debugsfs to give insight into the file systems behaviour,
but we don't offer our users the ability understand what writeback is
doing.  There is no way to know how active it is over the whole system, if
it's falling behind or to quantify it's efforts.  With these values
exported users can easily see how much data applications are sending
through writeback and also at what rates writeback is processing this
data.  Comparing the rates of change between the two allow developers to
see when writeback is not able to keep up with incoming traffic and the
rate of dirty memory being sent to the IO back end.  This allows folks to
understand their io workloads and track kernel issues.  Non kernel
engineers at Google often use these counters to solve puzzling performance
problems.

Patch #4 adds a pernode vmstat file with nr_dirtied and nr_written

Patch #5 add writeback thresholds to /proc/vmstat

Currently these values are in debugfs. But they should be promoted to
/proc since they are useful for developers who are writing databases
and file servers and are not debugging the kernel.

The output is as below:

 # grep threshold /proc/vmstat
 nr_pages_dirty_threshold 409111
 nr_pages_dirty_background_threshold 818223

This patch:

This allows code outside of the mm core to safely manipulate page
writeback state and not worry about the other accounting.  Not using these
routines means that some code will lose track of the accounting and we get
bugs.

Modify nilfs2 to use interface.

Signed-off-by: Michael Rubin <mrubin@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: Jiro SEKIBA <jir@unicus.jp>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/segment.c |  2 +-
 include/linux/mm.h  |  1 +
 mm/page-writeback.c | 13 ++++++++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index d926af626177..687d090cea34 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (!TestSetPageWriteback(clone_page))
-		inc_zone_page_state(clone_page, NR_WRITEBACK);
+		account_page_writeback(clone_page);
 	unlock_page(clone_page);
 
 	return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a4c66846fb8f..c36297faf7cb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -868,6 +868,7 @@ int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
 void account_page_dirtied(struct page *page, struct address_space *mapping);
+void account_page_writeback(struct page *page);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..94159819a651 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1128,6 +1128,17 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 }
 EXPORT_SYMBOL(account_page_dirtied);
 
+/*
+ * Helper function for set_page_writeback family.
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic
+ * wrt interrupts.
+ */
+void account_page_writeback(struct page *page)
+{
+	inc_zone_page_state(page, NR_WRITEBACK);
+}
+EXPORT_SYMBOL(account_page_writeback);
+
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * its radix tree.
@@ -1366,7 +1377,7 @@ int test_set_page_writeback(struct page *page)
 		ret = TestSetPageWriteback(page);
 	}
 	if (!ret)
-		inc_zone_page_state(page, NR_WRITEBACK);
+		account_page_writeback(page);
 	return ret;
 
 }
-- 
cgit v1.2.3


From 4cbec4c8b9fda9ec784086fe7f74cd32a8adda95 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 26 Oct 2010 14:21:45 -0700
Subject: writeback: remove the internal 5% low bound on dirty_ratio

The dirty_ratio was silently limited in global_dirty_limits() to >= 5%.
This is not a user expected behavior.  And it's inconsistent with
calc_period_shift(), which uses the plain vm_dirty_ratio value.

Let's remove the internal bound.

At the same time, fix balance_dirty_pages() to work with the
dirty_thresh=0 case.  This allows applications to proceed when
dirty+writeback pages are all cleaned.

And ">" fits with the name "exceeded" better than ">=" does.  Neil thinks
it is an aesthetic improvement as well as a functional one :)

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jan Kara <jack@suse.cz>
Proposed-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michael Rubin <mrubin@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c   |  2 +-
 mm/page-writeback.c | 16 +++++-----------
 2 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..97d2951bd4d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -582,7 +582,7 @@ static inline bool over_bground_thresh(void)
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4dd91f7fd39f..b840afa89761 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 
 	if (vm_dirty_bytes)
 		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
-	else {
-		int dirty_ratio;
-
-		dirty_ratio = vm_dirty_ratio;
-		if (dirty_ratio < 5)
-			dirty_ratio = 5;
-		dirty = (dirty_ratio * available_memory) / 100;
-	}
+	else
+		dirty = (vm_dirty_ratio * available_memory) / 100;
 
 	if (dirty_background_bytes)
 		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * catch-up. This avoids (excessively) small writeouts
 		 * when the bdi limits are ramping up.
 		 */
-		if (nr_reclaimable + nr_writeback <
+		if (nr_reclaimable + nr_writeback <=
 				(background_thresh + dirty_thresh) / 2)
 			break;
 
@@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * the last resort safeguard.
 		 */
 		dirty_exceeded =
-			(bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
-			|| (nr_reclaimable + nr_writeback >= dirty_thresh);
+			(bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
+			|| (nr_reclaimable + nr_writeback > dirty_thresh);
 
 		if (!dirty_exceeded)
 			break;
-- 
cgit v1.2.3


From 74ce002d9aee23031b4967e1dd1c1966ddc60749 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 26 Oct 2010 14:22:05 -0700
Subject: fs/fs-writeback.c: restore lost comment

I had to go back to a 2.6.20 tree to work out why we're adding a
number-of-inodes into a number-of-pages count.  Restore the lost comment.

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 97d2951bd4d1..b5aae4bd0aca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -721,6 +721,10 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 		return 0;
 
 	wb->last_old_flush = jiffies;
+	/*
+	 * Add in the number of potentially dirty inodes, because each inode
+	 * write can dirty pagecache in the underlying blockdev.
+	 */
 	nr_pages = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-- 
cgit v1.2.3


From c5c6dd4e2dce3cb4d705d97183bc42b4e33e2606 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Tue, 26 Oct 2010 14:22:20 -0700
Subject: hostfs: code cleanups

Some code cleanups for hostfs.

Signed-off-by: Richard Weinberger <richard@nod.at>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs_user.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 8d02683585e0..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
 
 	dir = opendir(path);
 	*err_out = errno;
-	if (dir == NULL)
-		return NULL;
+
 	return dir;
 }
 
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 	if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
 		if (fd >= 0) {
 			if (fchmod(fd, attrs->ia_mode) != 0)
-				return (-errno);
+				return -errno;
 		} else if (chmod(file, attrs->ia_mode) != 0) {
 			return -errno;
 		}
-- 
cgit v1.2.3


From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 26 Oct 2010 14:22:27 -0700
Subject: use clear_page()/copy_page() in favor of memset()/memcpy() on whole
 pages

After all that's what they are intended for.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c           |  2 +-
 kernel/kexec.c          |  2 +-
 kernel/power/snapshot.c | 14 +++++++-------
 kernel/power/swap.c     |  6 +++---
 mm/memory.c             |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..69037118b2c9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -811,7 +811,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 
 	if (page && zeroing && count < PAGE_SIZE) {
 		void *mapaddr = kmap_atomic(page, KM_USER1);
-		memset(mapaddr, 0, PAGE_SIZE);
+		clear_page(mapaddr);
 		kunmap_atomic(mapaddr, KM_USER1);
 	}
 	while (count) {
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 
 		ptr = kmap(page);
 		/* Start with a clear page */
-		memset(ptr, 0, PAGE_SIZE);
+		clear_page(ptr);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 		if (mchunk > mbytes)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9e3581f4619a..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 			 */
 			safe_copy_page(buffer, s_page);
 			dst = kmap_atomic(d_page, KM_USER0);
-			memcpy(dst, buffer, PAGE_SIZE);
+			copy_page(dst, buffer);
 			kunmap_atomic(dst, KM_USER0);
 		} else {
 			safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
 		memory_bm_position_reset(&orig_bm);
 		memory_bm_position_reset(&copy_bm);
 	} else if (handle->cur <= nr_meta_pages) {
-		memset(buffer, 0, PAGE_SIZE);
+		clear_page(buffer);
 		pack_pfns(buffer, &orig_bm);
 	} else {
 		struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
 			void *kaddr;
 
 			kaddr = kmap_atomic(page, KM_USER0);
-			memcpy(buffer, kaddr, PAGE_SIZE);
+			copy_page(buffer, kaddr);
 			kunmap_atomic(kaddr, KM_USER0);
 			handle->buffer = buffer;
 		} else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
 		void *dst;
 
 		dst = kmap_atomic(last_highmem_page, KM_USER0);
-		memcpy(dst, buffer, PAGE_SIZE);
+		copy_page(dst, buffer);
 		kunmap_atomic(dst, KM_USER0);
 		last_highmem_page = NULL;
 	}
@@ -2270,9 +2270,9 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
 
 	kaddr1 = kmap_atomic(p1, KM_USER0);
 	kaddr2 = kmap_atomic(p2, KM_USER1);
-	memcpy(buf, kaddr1, PAGE_SIZE);
-	memcpy(kaddr1, kaddr2, PAGE_SIZE);
-	memcpy(kaddr2, buf, PAGE_SIZE);
+	copy_page(buf, kaddr1);
+	copy_page(kaddr1, kaddr2);
+	copy_page(kaddr2, buf);
 	kunmap_atomic(kaddr2, KM_USER1);
 	kunmap_atomic(kaddr1, KM_USER0);
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 	if (bio_chain) {
 		src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
 		if (src) {
-			memcpy(src, buf, PAGE_SIZE);
+			copy_page(src, buf);
 		} else {
 			WARN_ON_ONCE(1);
 			bio_chain = NULL;	/* Go synchronous */
@@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		error = write_page(handle->cur, handle->cur_swap, NULL);
 		if (error)
 			goto out;
-		memset(handle->cur, 0, PAGE_SIZE);
+		clear_page(handle->cur);
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
@@ -910,7 +910,7 @@ int swsusp_check(void)
 	hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
-		memset(swsusp_header, 0, PAGE_SIZE);
+		clear_page(swsusp_header);
 		error = hib_bio_read_page(swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
diff --git a/mm/memory.c b/mm/memory.c
index 861f7982dd54..02e48aa0ed13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2080,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		 * zeroes.
 		 */
 		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
-			memset(kaddr, 0, PAGE_SIZE);
+			clear_page(kaddr);
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 	} else
-- 
cgit v1.2.3


From b6777c40c79168d938c30b5b7471fbd64bca109c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 26 Oct 2010 14:22:27 -0700
Subject: fuse: use clear_highpage() and KM_USER0 instead of KM_USER1

Commit 7909b1c640 ("fuse: don't use atomic kmap") removed KM_USER0 usage
from fuse/dev.c.  Switch KM_USER1 uses to KM_USER0 for clarity.  Also
replace open coded clear_highpage().

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 69037118b2c9..b98664275f02 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 	int err;
 	struct page *page = *pagep;
 
-	if (page && zeroing && count < PAGE_SIZE) {
-		void *mapaddr = kmap_atomic(page, KM_USER1);
-		clear_page(mapaddr);
-		kunmap_atomic(mapaddr, KM_USER1);
-	}
+	if (page && zeroing && count < PAGE_SIZE)
+		clear_highpage(page);
+
 	while (count) {
 		if (cs->write && cs->pipebufs && page) {
 			return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 			}
 		}
 		if (page) {
-			void *mapaddr = kmap_atomic(page, KM_USER1);
+			void *mapaddr = kmap_atomic(page, KM_USER0);
 			void *buf = mapaddr + offset;
 			offset += fuse_copy_do(cs, &buf, &count);
-			kunmap_atomic(mapaddr, KM_USER1);
+			kunmap_atomic(mapaddr, KM_USER0);
 		} else
 			offset += fuse_copy_do(cs, NULL, &count);
 	}
-- 
cgit v1.2.3


From cd1c584f380183aca268d454c14b22d8454eac4f Mon Sep 17 00:00:00 2001
From: Edward Shishkin <edward.shishkin@gmail.com>
Date: Tue, 26 Oct 2010 14:22:28 -0700
Subject: fs/direct-io.c: fix truncation error in dio_complete() return

Fix up truncation (ssize_t->int).  This only matters with >2G
reads/writes, which the kernel doesn't permit.

Signed-off-by: Edward Shishkin <edward@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..85882f6ba5f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
 {
 	ssize_t transferred = 0;
 
-- 
cgit v1.2.3


From 1df79da85657aecde2ecff052ff0cf9910311078 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 26 Oct 2010 14:22:31 -0700
Subject: fs/buffer.c: remove duplicated assignment to b_private

bh->b_private is initialized within init_buffer(), thus this assignment is
redundant.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index ec21a92e08b4..8d595ab2aed1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -905,7 +905,6 @@ try_again:
 
 		bh->b_state = 0;
 		atomic_set(&bh->b_count, 0);
-		bh->b_private = NULL;
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
-- 
cgit v1.2.3


From 4199ca77cc44c418972e8f30a36be7d26d21a0d2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 26 Oct 2010 14:22:32 -0700
Subject: fs: move exportfs since it is not a networking filesystem

Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block
since it provides a library function that can be (and is) used by other
(non-network) filesystems.

This also eliminates a kconfig dependency warning:

warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 65781de44fc0..b5e582bd769d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+config EXPORTFS
+	tristate
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -222,9 +225,6 @@ config LOCKD_V4
 	depends on FILE_LOCKING
 	default y
 
-config EXPORTFS
-	tristate
-
 config NFS_ACL_SUPPORT
 	tristate
 	select FS_POSIX_ACL
-- 
cgit v1.2.3


From 99dc829256bb8cfcb1f58b7f118893fdbf608e60 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 26 Oct 2010 14:22:33 -0700
Subject: procfs: fix numbering in /proc/locks

The lock number in /proc/locks (first field) is implemented by a counter
(private field of struct seq_file) which is incremented at each call of
locks_show() and reset to 1 in locks_start() whatever the offset is.  It
should be reset according to the actual position in the list.  Because of
this, the numbering erratically restarts at 1 several times when reading a
long /proc/locks file.

Moreover, locks_show() can be called twice to print a single line thus
skipping a number.  The counter should be incremented in locks_next().

And last, pos is a loff_t, which can be bigger than a pointer, so we don't
use the pointer as an integer anymore, and allocate a loff_t instead.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/locks.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 8b2b6ad56a09..4de3a2666810 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2109,7 +2109,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-							int id, char *pfx)
+			    loff_t id, char *pfx)
 {
 	struct inode *inode = NULL;
 	unsigned int fl_pid;
@@ -2122,7 +2122,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (fl->fl_file != NULL)
 		inode = fl->fl_file->f_path.dentry->d_inode;
 
-	seq_printf(f, "%d:%s ", id, pfx);
+	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
 		seq_printf(f, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2185,24 +2185,27 @@ static int locks_show(struct seq_file *f, void *v)
 
 	fl = list_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, (long)f->private, "");
+	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, (long)f->private, " ->");
+		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
 
-	f->private++;
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
+	loff_t *p = f->private;
+
 	lock_flocks();
-	f->private = (void *)1;
+	*p = (*pos + 1);
 	return seq_list_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+	loff_t *p = f->private;
+	++*p;
 	return seq_list_next(v, &file_lock_list, pos);
 }
 
@@ -2220,14 +2223,14 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open(filp, &locks_seq_operations);
+	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 
 static const struct file_operations proc_locks_operations = {
 	.open		= locks_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 static int __init proc_locks_init(void)
-- 
cgit v1.2.3


From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 26 Oct 2010 14:22:44 -0700
Subject: fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32bit value :

<quote>

We were seeing a failure which prevented boot.  The kernel was incapable
of creating either a named pipe or unix domain socket.  This comes down
to a common kernel function called unix_create1() which does:

        atomic_inc(&unix_nr_socks);
        if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000).  That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of atomic_t.

get_max_files() is changed to return an unsigned long.  get_nr_files() is
changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, while not
strictly needed to address Robin problem.

Before patch (on a 64bit kernel) :
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704     0       2147483648

Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c    | 17 +++++++----------
 include/linux/fs.h |  8 ++++----
 kernel/sysctl.c    |  6 +++---
 net/unix/af_unix.c | 14 +++++++-------
 4 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
  * Return the total number of open files in the system
  */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
 	return percpu_counter_read_positive(&nr_files);
 }
@@ -68,7 +68,7 @@ static int get_nr_files(void)
 /*
  * Return the maximum number of open files in the system
  */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
 	return files_stat.max_files;
 }
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
 	const struct cred *cred = current_cred();
-	static int old_max;
+	static long old_max;
 	struct file * f;
 
 	/*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
 	/* Ran out of filps - report that */
 	if (get_nr_files() > old_max) {
-		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					get_max_files());
+		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
 		old_max = get_nr_files();
 	}
 	goto fail;
@@ -487,7 +486,7 @@ retry:
 
 void __init files_init(unsigned long mempages)
 { 
-	int n; 
+	unsigned long n;
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
 	 */ 
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
-	files_stat.max_files = n; 
-	if (files_stat.max_files < NR_FILE)
-		files_stat.max_files = NR_FILE;
+	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
 	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f34ff6e5558..b2cdb6bc8287 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -34,9 +34,9 @@
 
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
-	int nr_files;		/* read only */
-	int nr_free_files;	/* read only */
-	int max_files;		/* tunable */
+	unsigned long nr_files;		/* read only */
+	unsigned long nr_free_files;	/* read only */
+	unsigned long max_files;		/* tunable */
 };
 
 struct inodes_stat_t {
@@ -400,7 +400,7 @@ extern void __init inode_init_early(void);
 extern void __init files_init(unsigned long);
 
 extern struct files_stat_struct files_stat;
-extern int get_max_files(void);
+extern unsigned long get_max_files(void);
 extern int sysctl_nr_open;
 extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..694b140852c2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "file-nr",
 		.data		= &files_stat,
-		.maxlen		= 3*sizeof(int),
+		.maxlen		= sizeof(files_stat),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_files,
 	},
 	{
 		.procname	= "file-max",
 		.data		= &files_stat.max_files,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(files_stat.max_files),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "nr_open",
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0ebc777a6660..3c95304a0817 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
-static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
 	if (u->addr)
 		unix_release_addr(u->addr);
 
-	atomic_dec(&unix_nr_socks);
+	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 #ifdef UNIX_REFCNT_DEBUG
-	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk,
-		atomic_read(&unix_nr_socks));
+	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+		atomic_long_read(&unix_nr_socks));
 #endif
 }
 
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	atomic_inc(&unix_nr_socks);
-	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
-		atomic_dec(&unix_nr_socks);
+		atomic_long_dec(&unix_nr_socks);
 	else {
 		local_bh_disable();
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-- 
cgit v1.2.3


From 766f9164193f6dda1497bbf3861060198421fb92 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 Oct 2010 14:22:45 -0700
Subject: kernel: remove PF_FLUSHER

PF_FLUSHER is only ever set, not tested, remove it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c     | 2 +-
 include/linux/sched.h | 1 -
 mm/backing-dev.c      | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b5aae4bd0aca..9e46aec10d1a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -794,7 +794,7 @@ int bdi_writeback_thread(void *data)
 	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	current->flags |= PF_SWAPWRITE;
 	set_freezable();
 	wb->last_active = jiffies;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56154bbb8da9..393ce94e54b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1706,7 +1706,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
-#define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
 #define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
 #define PF_FREEZING	0x00004000	/* freeze in progress. do not account to load */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5ad3c106606b..f2eb27884ffa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
 
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	current->flags |= PF_SWAPWRITE;
 	set_freezable();
 
 	/*
-- 
cgit v1.2.3


From f7c5445a9deecffea8a4fffc0163bf582411ac8a Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Tue, 26 Oct 2010 18:10:24 -0500
Subject: NTLM auth and sign - minor error corrections and cleanup

Minor cleanup - Fix spelling mistake, make meaningful (goto) label

In function setup_ntlmv2_rsp(), do not return 0 and leak memory,
let the tiblob get freed.

For function find_domain_name(), pass already available nls table pointer
instead of loading and unloading the table again in this function.

For ntlmv2, the case sensitive password length is the length of the
response, so subtract session key length (16 bytes) from the .len.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 15 ++++++---------
 fs/cifs/cifsglob.h    |  2 +-
 fs/cifs/connect.c     |  8 ++++----
 fs/cifs/sess.c        |  2 +-
 4 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 96908874a45c..17d603ad5e34 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -391,7 +391,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
  * about target string i.e. for some, just user name might suffice.
  */
 static int
-find_domain_name(struct cifsSesInfo *ses)
+find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	unsigned int attrsize;
 	unsigned int type;
@@ -420,16 +420,13 @@ find_domain_name(struct cifsSesInfo *ses)
 			if (!attrsize)
 				break;
 			if (!ses->domainName) {
-				struct nls_table *default_nls;
 				ses->domainName =
 					kmalloc(attrsize + 1, GFP_KERNEL);
 				if (!ses->domainName)
 						return -ENOMEM;
-				default_nls = load_nls_default();
 				cifs_from_ucs2(ses->domainName,
 					(__le16 *)blobptr, attrsize, attrsize,
-					default_nls, false);
-				unload_nls(default_nls);
+					nls_cp, false);
 				break;
 			}
 		}
@@ -561,7 +558,7 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 
 	if (ses->server->secType == RawNTLMSSP) {
 		if (!ses->domainName) {
-			rc = find_domain_name(ses);
+			rc = find_domain_name(ses, nls_cp);
 			if (rc) {
 				cERROR(1, "error %d finding domain name", rc);
 				goto setup_ntlmv2_rsp_ret;
@@ -594,12 +591,14 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 
 	memcpy(ses->auth_key.response + baselen, ses->tiblob, ses->tilen);
 
-	/* calculate buf->ntlmv2_hash */
+	/* calculate ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
 	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
+
+	/* calculate first part of the client response (CR1) */
 	rc = CalcNTLMv2_response(ses);
 	if (rc) {
 		cERROR(1, "Could not calculate CR1  rc: %d", rc);
@@ -623,8 +622,6 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
 		ses->auth_key.response);
 
-	return 0;
-
 setup_ntlmv2_rsp_ret:
 	kfree(ses->tiblob);
 	ses->tiblob = NULL;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7ca5f6d8ed80..67d6a2280a01 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -111,7 +111,7 @@ struct sdesc {
 	char ctx[];
 };
 
-/* crypto hashing related structure/fields, not speicific to a sec mech */
+/* crypto hashing related structure/fields, not specific to a sec mech */
 struct cifs_secmech {
 	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
 	struct crypto_shash *md5; /* md5 hash function */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 04239a7ff320..469c3ddba463 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1631,7 +1631,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
 		rc = PTR_ERR(tcp_ses->hostname);
-		goto out_err2;
+		goto out_err_crypto_release;
 	}
 
 	tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1675,7 +1675,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 	if (rc < 0) {
 		cERROR(1, "Error connecting to socket. Aborting operation");
-		goto out_err2;
+		goto out_err_crypto_release;
 	}
 
 	/*
@@ -1689,7 +1689,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = PTR_ERR(tcp_ses->tsk);
 		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
-		goto out_err2;
+		goto out_err_crypto_release;
 	}
 
 	/* thread spawned, put it on the list */
@@ -1701,7 +1701,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	return tcp_ses;
 
-out_err2:
+out_err_crypto_release:
 	cifs_crypto_shash_release(tcp_ses);
 
 out_err:
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d998c4f7aae5..e0515a62715d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -738,7 +738,7 @@ ssetup_ntlmssp_authenticate:
 		 * assigned, tilen is 0 otherwise.
 		 */
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(ses->auth_key.len);
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
-- 
cgit v1.2.3


From 85b8fe8cc47b0dc1068475ba95f29ddff10a8efc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Wed, 27 Oct 2010 13:45:50 +0200
Subject: hfsplus: free space correcly for files unlinked while open

hfsplus_delete_inode only truncates away all block allocations if
i_nlink is zero.  Make sure we properly drop the unlink count even
when doing the rename hack for open but unlinked files.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..e490aaf35174 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -317,8 +317,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		res = hfsplus_rename_cat(inode->i_ino,
 					 dir, &dentry->d_name,
 					 sbi->hidden_dir, &str);
-		if (!res)
+		if (!res) {
 			inode->i_flags |= S_DEAD;
+			drop_nlink(inode);
+		}
 		goto out;
 	}
 	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
-- 
cgit v1.2.3


From 763641d81202834e9d64de2019d1edec12868f4f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 26 Oct 2010 22:55:40 +0200
Subject: lockd: push lock_flocks down

lockd should use lock_flocks() instead of lock_kernel()
to lock against posix locks accessing the i_flock list.

This is a prerequisite to turning lock_flocks into a
spinlock.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c     | 11 -----------
 fs/lockd/svcsubs.c |  9 ++++++++-
 fs/nfs/Kconfig     |  1 -
 fs/nfsd/Kconfig    |  1 -
 4 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index b13aabc12298..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
 #include <linux/in.h>
 #include <linux/uio.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
 
 	dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
 
-	/*
-	 * FIXME: it would be nice if lockd didn't spend its entire life
-	 * running under the BKL. At the very least, it would be good to
-	 * have someone clarify what it's intended to protect here. I've
-	 * seen some handwavy posts about posix locking needing to be
-	 * done under the BKL, but it's far from clear.
-	 */
-	lock_kernel();
-
 	if (!nlm_timeout)
 		nlm_timeout = LOCKD_DFLT_TIMEO;
 	nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
+	lock_flocks(); /* protects i_flock list */
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,6 +182,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
+			unlock_flocks();
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
 			goto again;
 		}
 	}
+	unlock_flocks();
 
 	return 0;
 }
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
+	lock_flocks();
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-		if (fl->fl_lmops == &nlmsvc_lock_operations)
+		if (fl->fl_lmops == &nlmsvc_lock_operations) {
+			unlock_flocks();
 			return 1;
+		}
 	}
+	unlock_flocks();
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index fd667652c502..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,7 +1,6 @@
 config NFS_FS
 	tristate "NFS client support"
 	depends on INET && FILE_LOCKING
-	depends on BKL # fix as soon as lockd is done
 	select LOCKD
 	select SUNRPC
 	select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 31a78fce4732..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -2,7 +2,6 @@ config NFSD
 	tristate "NFS server support"
 	depends on INET
 	depends on FILE_LOCKING
-	depends on BKL # fix as soon as lockd is done
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
-- 
cgit v1.2.3


From a282a1fa6b23bd21ba0b86e53ed2a316b001836f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 26 Oct 2010 18:25:30 -0400
Subject: lockd: fix nlmsvc_notify_blocked locking

nlmsvc_notify_blocked walks the nlm_blocked list,
which requires nlm_blocked_lock.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/lockd/svclock.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6f1ef000975a..c462d346acbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -700,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
 	struct nlm_block	*block;
 
 	dprintk("lockd: VFS unblock notification for block %p\n", fl);
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
+			spin_unlock(&nlm_blocked_lock);
 			svc_wake_up(block->b_daemon);
 			return;
 		}
 	}
-
+	spin_unlock(&nlm_blocked_lock);
 	printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
 
-- 
cgit v1.2.3


From c5b1f0d92c36851aca09ac6c7c0c4f9690ac14f3 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 27 Oct 2010 15:46:08 +0200
Subject: locks/nfsd: allocate file lock outside of spinlock

As suggested by Christoph Hellwig, this moves allocation
of new file locks out of generic_setlease into the
callers, nfs4_open_delegation and fcntl_setlease in order
to allow GFP_KERNEL allocations when lock_flocks has
become a spinlock.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c          | 36 ++++++++++++------------------------
 fs/nfsd/nfs4state.c | 26 +++++++++++++++-----------
 include/linux/fs.h  |  1 +
 3 files changed, 28 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 8b2b6ad56a09..0391d2ff5a4e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -162,10 +162,11 @@ EXPORT_SYMBOL_GPL(unlock_flocks);
 static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(void)
 {
 	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
 
 void locks_release_private(struct file_lock *fl)
 {
@@ -1365,7 +1366,6 @@ int fcntl_getlease(struct file *filp)
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct file_lock *new_fl = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
@@ -1385,11 +1385,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	lease = *flp;
 
 	if (arg != F_UNLCK) {
-		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-
 		error = -EAGAIN;
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 			goto out;
@@ -1434,7 +1429,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 	}
 
-	error = 0;
 	if (arg == F_UNLCK)
 		goto out;
 
@@ -1442,15 +1436,11 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (!leases_enable)
 		goto out;
 
-	locks_copy_lock(new_fl, lease);
-	locks_insert_lock(before, new_fl);
-
-	*flp = new_fl;
+	locks_insert_lock(before, lease);
 	return 0;
 
 out:
-	if (new_fl != NULL)
-		locks_free_lock(new_fl);
+	locks_free_lock(lease);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
@@ -1514,26 +1504,24 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
  */
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
-	struct file_lock fl, *flp = &fl;
+	struct file_lock *fl;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
-	locks_init_lock(&fl);
-	error = lease_init(filp, arg, &fl);
-	if (error)
-		return error;
+	fl = lease_alloc(filp, arg);
+	if (IS_ERR(fl))
+		return PTR_ERR(fl);
 
 	lock_flocks();
-
-	error = __vfs_setlease(filp, arg, &flp);
+	error = __vfs_setlease(filp, arg, &fl);
 	if (error || arg == F_UNLCK)
 		goto out_unlock;
 
-	error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
+	error = fasync_helper(fd, filp, 1, &fl->fl_fasync);
 	if (error < 0) {
 		/* remove lease just inserted by setlease */
-		flp->fl_type = F_UNLCK | F_INPROGRESS;
-		flp->fl_break_time = jiffies - 10;
+		fl->fl_type = F_UNLCK | F_INPROGRESS;
+		fl->fl_break_time = jiffies - 10;
 		time_out_leases(inode);
 		goto out_unlock;
 	}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9019e8ec9dc8..56347e0ac88d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2614,7 +2614,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
 	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
-	struct file_lock fl, *flp = &fl;
+	struct file_lock *fl;
 	int status, flag = 0;
 
 	flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2648,20 +2648,24 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
 	}
-	locks_init_lock(&fl);
-	fl.fl_lmops = &nfsd_lease_mng_ops;
-	fl.fl_flags = FL_LEASE;
-	fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-	fl.fl_end = OFFSET_MAX;
-	fl.fl_owner =  (fl_owner_t)dp;
-	fl.fl_file = find_readable_file(stp->st_file);
-	BUG_ON(!fl.fl_file);
-	fl.fl_pid = current->tgid;
+	status = -ENOMEM;
+	fl = locks_alloc_lock();
+	if (!fl)
+		goto out;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd_lease_mng_ops;
+	fl->fl_flags = FL_LEASE;
+	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner =  (fl_owner_t)dp;
+	fl->fl_file = find_readable_file(stp->st_file);
+	BUG_ON(!fl->fl_file);
+	fl->fl_pid = current->tgid;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
+	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bb20373d0b46..8d7de08ab546 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1113,6 +1113,7 @@ extern int fcntl_getlease(struct file *filp);
 
 /* fs/locks.c */
 extern void locks_init_lock(struct file_lock *);
+extern struct file_lock * locks_alloc_lock(void);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
-- 
cgit v1.2.3


From f7347ce4ee7c65415f84be915c018473e7076f31 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 27 Oct 2010 12:38:12 -0400
Subject: fasync: re-organize fasync entry insertion to allow it under a
 spinlock

You currently cannot use "fasync_helper()" in an atomic environment to
insert a new fasync entry, because it will need to allocate the new
"struct fasync_struct".

Yet fcntl_setlease() wants to call this under lock_flocks(), which is in
the process of being converted from the BKL to a spinlock.

In order to fix this, this abstracts out the actual fasync list
insertion and the fasync allocations into functions of their own, and
teaches fs/locks.c to pre-allocate the fasync_struct entry.  That way
the actual list insertion can happen while holding the required
spinlock.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bfields@redhat.com: rebase on top of my changes to Arnd's patch]
Tested-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/fcntl.c         | 66 +++++++++++++++++++++++++++++++++++++++++-------------
 fs/locks.c         | 18 ++++++++++++++-
 include/linux/fs.h |  5 +++++
 3 files changed, 72 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..dcdbc6f5c33b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
  * match the state "is the filp on a fasync list".
  *
  */
-static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
 	struct fasync_struct *fa, **fp;
 	int result = 0;
@@ -666,21 +666,28 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 	return result;
 }
 
+struct fasync_struct *fasync_alloc(void)
+{
+	return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+}
+
 /*
- * Add a fasync entry. Return negative on error, positive if
- * added, and zero if did nothing but change an existing one.
- *
- * NOTE! It is very important that the FASYNC flag always
- * match the state "is the filp on a fasync list".
+ * NOTE! This can be used only for unused fasync entries:
+ * entries that actually got inserted on the fasync list
+ * need to be released by rcu - see fasync_remove_entry.
  */
-static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+void fasync_free(struct fasync_struct *new)
 {
-	struct fasync_struct *new, *fa, **fp;
-	int result = 0;
+	kmem_cache_free(fasync_cache, new);
+}
 
-	new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
+/*
+ * Insert a new entry into the fasync list.  Return the pointer to the
+ * old one if we didn't use the new one.
+ */
+struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
+{
+        struct fasync_struct *fa, **fp;
 
 	spin_lock(&filp->f_lock);
 	spin_lock(&fasync_lock);
@@ -691,8 +698,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 		spin_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
 		spin_unlock_irq(&fa->fa_lock);
-
-		kmem_cache_free(fasync_cache, new);
 		goto out;
 	}
 
@@ -702,13 +707,42 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 	new->fa_fd = fd;
 	new->fa_next = *fapp;
 	rcu_assign_pointer(*fapp, new);
-	result = 1;
 	filp->f_flags |= FASYNC;
 
 out:
 	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
-	return result;
+	return fa;
+}
+
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+	struct fasync_struct *new;
+
+	new = fasync_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	/*
+	 * fasync_insert_entry() returns the old (update) entry if
+	 * it existed.
+	 *
+	 * So free the (unused) new entry and return 0 to let the
+	 * caller know that we didn't add any new fasync entries.
+	 */
+	if (fasync_insert_entry(fd, filp, fapp, new)) {
+		fasync_free(new);
+		return 0;
+	}
+
+	return 1;
 }
 
 /*
diff --git a/fs/locks.c b/fs/locks.c
index 0391d2ff5a4e..85fd9ce1abae 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1505,6 +1505,7 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl;
+	struct fasync_struct *new;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
@@ -1512,12 +1513,25 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 	if (IS_ERR(fl))
 		return PTR_ERR(fl);
 
+	new = fasync_alloc();
+	if (!new) {
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
 	lock_flocks();
 	error = __vfs_setlease(filp, arg, &fl);
 	if (error || arg == F_UNLCK)
 		goto out_unlock;
 
-	error = fasync_helper(fd, filp, 1, &fl->fl_fasync);
+	/*
+	 * fasync_insert_entry() returns the old entry if any.
+	 * If there was no old entry, then it used 'new' and
+	 * inserted it into the fasync list. Clear new so that
+	 * we don't release it here.
+	 */
+	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+		new = NULL;
+
 	if (error < 0) {
 		/* remove lease just inserted by setlease */
 		fl->fl_type = F_UNLCK | F_INPROGRESS;
@@ -1529,6 +1543,8 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 out_unlock:
 	unlock_flocks();
+	if (new)
+		fasync_free(new);
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8d7de08ab546..56285e5e1de4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1302,6 +1302,11 @@ struct fasync_struct {
 
 /* SMP safe fasync helpers: */
 extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
+extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
+extern int fasync_remove_entry(struct file *, struct fasync_struct **);
+extern struct fasync_struct *fasync_alloc(void);
+extern void fasync_free(struct fasync_struct *);
+
 /* can be called from interrupts */
 extern void kill_fasync(struct fasync_struct **, int, int);
 
-- 
cgit v1.2.3


From 72f98e72551fad573c6cace8e8551ef094f482dd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 27 Oct 2010 21:39:58 +0200
Subject: locks: turn lock_flocks into a spinlock

Nothing depends on lock_flocks using the BKL
any more, so we can do the switch over to
a private spinlock.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/Kconfig | 1 -
 fs/locks.c | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 65781de44fc0..3d185308ec88 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,6 @@ endif # BLOCK
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
-	select BKL # while lockd still uses it.
 	help
 	  This option enables standard file locking support, required
           for filesystems like NFS and for the flock() system
diff --git a/fs/locks.c b/fs/locks.c
index 85fd9ce1abae..74c3df99c0e1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,6 +142,7 @@ int lease_break_time = 45;
 
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
+static DEFINE_SPINLOCK(file_lock_lock);
 
 /*
  * Protects the two list heads above, plus the inode->i_flock list
@@ -149,13 +150,13 @@ static LIST_HEAD(blocked_list);
  */
 void lock_flocks(void)
 {
-	lock_kernel();
+	spin_lock(&file_lock_lock);
 }
 EXPORT_SYMBOL_GPL(lock_flocks);
 
 void unlock_flocks(void)
 {
-	unlock_kernel();
+	spin_unlock(&file_lock_lock);
 }
 EXPORT_SYMBOL_GPL(unlock_flocks);
 
-- 
cgit v1.2.3


From 26f78b7a423aea094c536e94cbe1f1fa4641dd7b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sun, 26 Sep 2010 15:31:00 +0900
Subject: ext2: fix comment on ext2_try_to_allocate()

@handle doesn't exist in ext2. Remove it.
Also, fit comment header into kernel-doc format.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/balloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 	return here;
 }
 
-/*
+/**
  * ext2_try_to_allocate()
  * @sb:			superblock
- * @handle:		handle to this transaction
  * @group:		given allocation block group
  * @bitmap_bh:		bufferhead holds the block bitmap
  * @grp_goal:		given target block within the group
-- 
cgit v1.2.3


From c5639bef63a241d8d5a00e5243d7f304d7563c46 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 4 Oct 2010 17:26:53 +0900
Subject: jbd: Move debug message into #ifdef area

Move call to jbd_debug() into #ifdef CONFIG_JBD_DEBUG block because
'dropped' is declared there. The code could be compiled without this
change anyway, simply because jbd_debug() expands to nothing if
!CONFIG_JBD_DEBUG but IMHO it doesn't look good in general.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD_DEBUG
 		int dropped = info.end_transaction -
 			      be32_to_cpu(journal->j_superblock->s_sequence);
-#endif
 		jbd_debug(1,
 			  "JBD: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
+#endif
 		journal->j_transaction_sequence = ++info.end_transaction;
 	}
 
-- 
cgit v1.2.3


From f81e3d4564a06c5a070337ba527d11865acecb67 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 4 Oct 2010 19:12:13 +0900
Subject: jbd: Use printk_ratelimited() in journal_alloc_journal_head()

Use printk_ratelimited() instead of doing it manually.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..d7a86935553a 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/ratelimit.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -1719,7 +1720,6 @@ static void journal_destroy_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
 	struct journal_head *ret;
-	static unsigned long last_warning;
 
 #ifdef CONFIG_JBD_DEBUG
 	atomic_inc(&nr_journal_heads);
@@ -1727,11 +1727,9 @@ static struct journal_head *journal_alloc_journal_head(void)
 	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
 	if (ret == NULL) {
 		jbd_debug(1, "out of memory for journal_head\n");
-		if (time_after(jiffies, last_warning + 5*HZ)) {
-			printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-			       __func__);
-			last_warning = jiffies;
-		}
+		printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+				   __func__);
+
 		while (ret == NULL) {
 			yield();
 			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
-- 
cgit v1.2.3


From 2b23976908d9f8406f8b10b86c83b2cee302bc7d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 5 Oct 2010 00:13:59 +0900
Subject: jbd: Remove unnecessary goto statement

Remove goto statement which jumps to very next line. Also remove
target label because it is no longer used anywhere.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..981449ca8f18 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		jbd_free_handle(handle);
 		current->journal_info = NULL;
 		handle = ERR_PTR(err);
-		goto out;
 	}
-out:
 	return handle;
 }
 
-- 
cgit v1.2.3


From 8117f98c058dbe463eaba2b51b84d19bd5d78804 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 5 Oct 2010 01:03:40 +0900
Subject: jbd: Use offset_in_page() instead of manual calculation

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 981449ca8f18..c9bb7a73f8fd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -711,7 +711,7 @@ done:
 		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
 			    "Possible IO failure.\n");
 		page = jh2bh(jh)->b_page;
-		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+		offset = offset_in_page(jh2bh(jh)->b_data);
 		source = kmap_atomic(page, KM_USER0);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
-- 
cgit v1.2.3


From dff6825e9fde93891e60751e01480337a991235e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Mon, 4 Oct 2010 12:35:05 -0700
Subject: ext3/jbd: Avoid WARN() messages when failing to write the superblock

This fixes a WARN backtrace in mark_buffer_dirty() that occurs during unmount
when the underlying block device is removed.  This bug has been seen on System
Z when removing all paths from a multipath-backed ext3 mount; on System P when
injecting enough PCI EEH errors to make the SCSI controller go offline; and
similar warnings have been seen (and patched) with ext2/ext4.

The super block update from a previous operation has marked the buffer as in
error, and the flag has to be cleared before doing the update. Similar changes
have been made to ext4 by commit 914258bf2cb22bf4336a1b1d90c551b4b11ca5aa.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c  | 24 +++++++++++++++++++++++-
 fs/jbd/journal.c | 30 ++++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..3ef272488ac9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2361,6 +2361,21 @@ static int ext3_commit_super(struct super_block *sb,
 
 	if (!sbh)
 		return error;
+
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext3_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
 	/*
 	 * If the file system is mounted read-only, don't update the
 	 * superblock write time.  This avoids updating the superblock
@@ -2377,8 +2392,15 @@ static int ext3_commit_super(struct super_block *sb,
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-	if (sync)
+	if (sync) {
 		error = sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			ext3_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
 	return error;
 }
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index d7a86935553a..e56117651826 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -85,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static const char *journal_dev_name(journal_t *journal, char *buffer);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1011,6 +1012,23 @@ void journal_update_superblock(journal_t *journal, int wait)
 		goto out;
 	}
 
+	if (buffer_write_io_error(bh)) {
+		char b[BDEVNAME_SIZE];
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal_dev_name(journal, b));
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
 	spin_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1022,9 +1040,17 @@ void journal_update_superblock(journal_t *journal, int wait)
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
-	if (wait)
+	if (wait) {
 		sync_dirty_buffer(bh);
-	else
+		if (buffer_write_io_error(bh)) {
+			char b[BDEVNAME_SIZE];
+			printk(KERN_ERR "JBD: I/O error detected "
+			       "when updating journal superblock for %s.\n",
+			       journal_dev_name(journal, b));
+			clear_buffer_write_io_error(bh);
+			set_buffer_uptodate(bh);
+		}
+	} else
 		write_dirty_buffer(bh, WRITE);
 
 out:
-- 
cgit v1.2.3


From a910eefa511f9d1118effc13fba6773163502c4f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Fri, 8 Oct 2010 20:05:06 +0900
Subject: jbd: Convert bitops to buffer fns

Convert set/clear_bit(BH_JWrite, ...) to set/clear_buffer_jwrite()
for consistency.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/commit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..c8428323167f 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -617,7 +617,7 @@ void journal_commit_transaction(journal_t *journal)
                    (this will requeue both the metadata buffer and the
                    temporary IO buffer). new_bh goes on BJ_IO*/
 
-		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+		set_buffer_jwrite(jh2bh(jh));
 		/*
 		 * akpm: journal_write_metadata_buffer() sets
 		 * new_bh->b_transaction to commit_transaction.
@@ -627,7 +627,7 @@ void journal_commit_transaction(journal_t *journal)
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = journal_write_metadata_buffer(commit_transaction,
 						      jh, &new_jh, blocknr);
-		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+		set_buffer_jwrite(jh2bh(new_jh));
 		wbuf[bufs++] = jh2bh(new_jh);
 
 		/* Record the new block's tag in the current descriptor
@@ -737,7 +737,7 @@ wait_for_iobuf:
                    shadowed buffer */
 		jh = commit_transaction->t_shadow_list->b_tprev;
 		bh = jh2bh(jh);
-		clear_bit(BH_JWrite, &bh->b_state);
+		clear_buffer_jwrite(bh);
 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
 
 		/* The metadata is now released for reuse, but we need
-- 
cgit v1.2.3


From 9e32784b71c2c84895016ca6ab271591669c02aa Mon Sep 17 00:00:00 2001
From: Dmitry <dmonakhov@openvz.org>
Date: Sat, 9 Oct 2010 23:15:30 +0400
Subject: quota: fix dquot_disable vs dquot_transfer race v2

I've got following lockup:
dquot_disable                              dquot_transfer
                                            ->dqget()
					       sb_has_quota_active
dqopt->flags &= ~dquot_state_flag(f, cnt)      atomic_inc(dq->dq_count)
 ->drop_dquot_ref(sb, cnt);
    down_write(dqptr_sem)
    inode->i_dquot[cnt] = NULL              ->__dquot_transfer
invalidate_dquots(sb, cnt);		       down_write(&dqptr_sem)
  ->wait for dq_wait_unused		       inode->i_dquot = new_dquot
  /* wait forever */                            ^^^^New quota user^^^^^^

We cannot allow new references to dquots from inodes after drop_dquot_ref()
has removed them.  We have to recheck quota state under dqptr_sem and before
assignment, as we do it in dquot_initialize().

Signed-off-by: Dmitry Monakhov <dmonakhov@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..ed14beea8fe3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1736,6 +1736,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	qsize_t rsv_space = 0;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
 	int cnt, ret = 0;
+	char is_valid[MAXQUOTAS] = {};
 	char warntype_to[MAXQUOTAS];
 	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1757,8 +1758,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	space = cur_space + rsv_space;
 	/* Build the transfer_from list and check the limits */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		/*
+		 * Skip changes for same uid or gid or for turned off quota-type.
+		 */
 		if (!transfer_to[cnt])
 			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(inode->i_sb, cnt))
+			continue;
+		is_valid[cnt] = 1;
 		transfer_from[cnt] = inode->i_dquot[cnt];
 		ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
 		if (ret)
@@ -1772,12 +1780,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	 * Finally perform the needed transfer from transfer_from to transfer_to
 	 */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		/*
-		 * Skip changes for same uid or gid or for turned off quota-type.
-		 */
-		if (!transfer_to[cnt])
+		if (!is_valid[cnt])
 			continue;
-
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			warntype_from_inodes[cnt] =
@@ -1803,7 +1807,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	mark_all_dquot_dirty(transfer_to);
 	/* Pass back references to put */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_to[cnt] = transfer_from[cnt];
+		if (is_valid[cnt])
+			transfer_to[cnt] = transfer_from[cnt];
+
 warn:
 	flush_warnings(transfer_to, warntype_to);
 	flush_warnings(transfer_from, warntype_from_inodes);
-- 
cgit v1.2.3


From 86f3cbec4a193c04d0a31c13132c5956731af6ff Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 11 Oct 2010 15:22:21 +0200
Subject: quota: Fix issuing of warnings from dquot_transfer

__dquot_transfer accidentally called flush_warnings for a wrong set of
dquots which could result in quota warnings being issued with a wrong
identification. Also when operation fails because of EDQUOT, there's no
need check for issuing information message about user getting below limits
(no transfer has actually happened).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ed14beea8fe3..1bc38f56fa7c 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1805,20 +1805,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
 	mark_all_dquot_dirty(transfer_from);
 	mark_all_dquot_dirty(transfer_to);
+	flush_warnings(transfer_to, warntype_to);
+	flush_warnings(transfer_from, warntype_from_inodes);
+	flush_warnings(transfer_from, warntype_from_space);
 	/* Pass back references to put */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		if (is_valid[cnt])
 			transfer_to[cnt] = transfer_from[cnt];
-
-warn:
-	flush_warnings(transfer_to, warntype_to);
-	flush_warnings(transfer_from, warntype_from_inodes);
-	flush_warnings(transfer_from, warntype_from_space);
-	return ret;
+	return 0;
 over_quota:
 	spin_unlock(&dq_data_lock);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	goto warn;
+	flush_warnings(transfer_to, warntype_to);
+	return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
 
-- 
cgit v1.2.3


From df0d6b8ff152b1a1aaae17c27a445ad025a358bd Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sun, 10 Oct 2010 21:36:59 +0900
Subject: ext3: Cleanup ext3_setup_super()

Fix mount-count check to emit warning only if s_max_mnt_count
is greater than 0 according to man tune2fs(8). Also removes
unnecessary casts.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3ef272488ac9..daf34b1c4fb5 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1306,9 +1306,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 		ext3_msg(sb, KERN_WARNING,
 			"warning: mounting fs with errors, "
 			"running e2fsck is recommended");
-	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
-		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+			le16_to_cpu(es->s_max_mnt_count))
 		ext3_msg(sb, KERN_WARNING,
 			"warning: maximal mount count reached, "
 			"running e2fsck is recommended");
@@ -1325,7 +1325,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                    valid forever! :) */
 	es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
 #endif
-	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	es->s_mtime = cpu_to_le32(get_seconds());
-- 
cgit v1.2.3


From 57e94d8647e9aa60ad317ccd0cd54eefd603f1fe Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 11 Oct 2010 02:10:45 +0900
Subject: ext3: Remove unnecessary casts on bh->b_data

bh->b_data is already a pointer to char so casts to 'char *' should
be meaningless. Remove them.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 4 ++--
 fs/ext3/super.c  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..a49ce4a7c23d 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 	if (here < 0)
 		here = 0;
 
-	p = ((char *)bh->b_data) + (here >> 3);
+	p = bh->b_data + (here >> 3);
 	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
+	next = (r - bh->b_data) << 3;
 
 	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
 		return next;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index daf34b1c4fb5..af7aead1000a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1654,7 +1654,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	 * Note: s_es must be initialized as soon as possible because
 	 *       some ext3 macro-instructions depend on its value
 	 */
-	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+	es = (struct ext3_super_block *) (bh->b_data + offset);
 	sbi->s_es = es;
 	sb->s_magic = le16_to_cpu(es->s_magic);
 	if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1765,7 +1765,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			       "error: can't read superblock on 2nd try");
 			goto failed_mount;
 		}
-		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+		es = (struct ext3_super_block *)(bh->b_data + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
 			ext3_msg(sb, KERN_ERR,
@@ -2168,7 +2168,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 		goto out_bdev;
 	}
 
-	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+	es = (struct ext3_super_block *) (bh->b_data + offset);
 	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
 	    !(le32_to_cpu(es->s_feature_incompat) &
 	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-- 
cgit v1.2.3


From 4569cd1b0d91e4d7fa67f950201befc2acfecb34 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 11 Oct 2010 19:08:06 +0900
Subject: ext3: Return proper error code on ext3_fill_super()

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index af7aead1000a..1811c6fd5ba4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1871,6 +1871,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (sbi->s_group_desc == NULL) {
 		ext3_msg(sb, KERN_ERR,
 			"error: not enough memory");
+		ret = -ENOMEM;
 		goto failed_mount;
 	}
 
@@ -1958,6 +1959,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	}
 	if (err) {
 		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+		ret = err;
 		goto failed_mount3;
 	}
 
-- 
cgit v1.2.3


From 81a4e320e6ee29bde3fe880ab87b2122bc1de88b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 11 Oct 2010 19:38:39 +0900
Subject: ext3: Use DIV_ROUND_UP() on group desc block counting

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1811c6fd5ba4..1c6875118b1e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1864,8 +1864,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
 			       le32_to_cpu(es->s_first_data_block) - 1)
 				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
-	db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
-		   EXT3_DESC_PER_BLOCK(sb);
+	db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
 	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-- 
cgit v1.2.3


From 2a0e33889b3f5e3c270dd1b746f4c0a75efa4790 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 13 Oct 2010 17:17:18 +0900
Subject: jbd: Check return value of __getblk()

Fail journal creation if __getblk() returns NULL.  unlikely() is
added because it is called in a loop and we've been OK without
the check until now.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e56117651826..ac1840415a65 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -952,6 +952,8 @@ int journal_create(journal_t *journal)
 		if (err)
 			return err;
 		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (unlikely(!bh))
+			return -ENOMEM;
 		lock_buffer(bh);
 		memset (bh->b_data, 0, journal->j_blocksize);
 		BUFFER_TRACE(bh, "marking dirty");
-- 
cgit v1.2.3


From b8ea49fa9baff99523aa51e680e76e8ccb5f25f3 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 13 Oct 2010 23:14:07 +0900
Subject: jbd: Fix debug message in do_get_write_access()

'buffer_head' should be 'journal_head'.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c9bb7a73f8fd..846a3f314111 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -526,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	transaction = handle->h_transaction;
 	journal = transaction->t_journal;
 
-	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
-- 
cgit v1.2.3


From bfa01dfbe0a02f5ba434221c351f0510c06af392 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Fri, 15 Oct 2010 20:03:02 +0900
Subject: ext3: Remove misplaced BUFFER_TRACE() in ext3_truncate()

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..90a9c3d7dadf 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2530,7 +2530,6 @@ void ext3_truncate(struct inode *inode)
 			 */
 		} else {
 			/* Shared branch grows from an indirect block */
-			BUFFER_TRACE(partial->bh, "get_write_access");
 			ext3_free_branches(handle, inode, partial->bh,
 					partial->p,
 					partial->p+1, (chain+n-1) - partial);
-- 
cgit v1.2.3


From e4d5e3a497e159be7c2dbe4c61cfb185d60cfde2 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 16 Oct 2010 17:11:02 +0900
Subject: jbd: Convert atomic_inc() to get_bh()

Convert atomic_inc(&bh->b_count) to get_bh(bh) for consistency.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/checkpoint.c | 4 ++--
 fs/jbd/commit.c     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
 			goto restart;
 		}
 		if (buffer_locked(bh)) {
-			atomic_inc(&bh->b_count);
+			get_bh(bh);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 	int ret = 0;
 
 	if (buffer_locked(bh)) {
-		atomic_inc(&bh->b_count);
+		get_bh(bh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
 		wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index c8428323167f..a89c4630b1ed 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -611,7 +611,7 @@ void journal_commit_transaction(journal_t *journal)
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-		atomic_inc(&jh2bh(jh)->b_count);
+		get_bh(jh2bh(jh));
 
 		/* Make a temporary IO buffer with which to write it out
                    (this will requeue both the metadata buffer and the
-- 
cgit v1.2.3


From db50d20b1da6ebb62bc28172de142efd9e7b5d89 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sun, 17 Oct 2010 01:11:23 +0900
Subject: ext3: Fix debug messages in ext3_group_extend()

Fix a typo, break long lines and use E3FSBLK on ext3_fsblk_t.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/resize.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..e746d30b1232 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -977,7 +977,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_blocks_count = le32_to_cpu(es->s_blocks_count);
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+		       " upto "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-			" too large to resize to %lu blocks safely\n",
+			" too large to resize to "E3FSBLK" blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
@@ -1065,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
-		   o_blocks_count + add);
+	ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
-		   o_blocks_count + add);
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
 	if (test_opt(sb, DEBUG))
-- 
cgit v1.2.3


From 58c6ed38a16cd235588ed4a5ca3b213f055bc14c Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 16 Oct 2010 15:19:11 +0200
Subject: ext2: fixed typo.

"excpet"

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..f834f0d8d563 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
  *	the same format as ext2_get_branch() would do. We are calling it after
  *	we had read the existing part of chain and partial points to the last
  *	triple of that (one with zero ->key). Upon the exit we have the same
- *	picture as after the successful ext2_get_block(), excpet that in one
+ *	picture as after the successful ext2_get_block(), except that in one
  *	place chain is disconnected - *branch->p is still zero (we did not
  *	set the last link), but branch->key contains the number that should
  *	be placed into *branch->p to fill that gap.
-- 
cgit v1.2.3


From bcf3d0bcff9c2ab5794d9a2c5f0ebc9ce63cc63f Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 16 Oct 2010 15:19:14 +0200
Subject: jbd/2: fixed typos

"wakup"

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c  | 2 +-
 fs/jbd2/journal.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index ac1840415a65..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -441,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
 	 */
 	if (!tid_geq(journal->j_commit_request, target)) {
 		/*
-		 * We want a new commit: OK, mark the request and wakup the
+		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
 		 */
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..2b9a342551b9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -478,7 +478,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 	 */
 	if (!tid_geq(journal->j_commit_request, target)) {
 		/*
-		 * We want a new commit: OK, mark the request and wakup the
+		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
 		 */
 
-- 
cgit v1.2.3


From a4c18ad2eed93194a667cb9f6662c3b3e8f0bba9 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 19 Oct 2010 00:34:35 +0900
Subject: ext3: Update kernel-doc comments

Update missing/broken argument descriptions and fix formatting.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 13 ++++++++-----
 fs/ext3/inode.c  | 19 ++++++++++++-------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a49ce4a7c23d..b3db22649426 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 
 /**
  * claim_block()
+ * @lock:		the spin lock for this block group
  * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
+ * @bh:			the buffer_head contains the block group bitmap
  *
  * We think we can allocate this block in this bitmap.  Try to set the bit.
  * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
  *		but we will shift to the place where start_block is,
  *		then start from there, when looking for a reservable space.
  *
- *	@size: the target new reservation window size
+ *	@my_rsv: the reservation window
  *
- *	@group_first_block: the first block we consider to start
+ *	@sb: the super block
+ *
+ *	@start_block: the first block we consider to start
  *			the real search from
  *
  *	@last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
  *
  *	failed: we failed to find a reservation window in this group
  *
- *	@rsv: the reservation
+ *	@my_rsv: the reservation window
  *
  *	@grp_goal: The goal (group-relative).  It is where the search for a
  *		free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * @group:		given allocation block group
  * @bitmap_bh:		bufferhead holds the block bitmap
  * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
  * @my_rsv:		reservation window
+ * @count:		target number of blocks to allocate
  * @errp:		pointer to store the error code
  *
  * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 90a9c3d7dadf..ef1c23a225f7 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 }
 
 /**
- *	ext3_blks_to_allocate: Look up the block map and count the number
+ *	ext3_blks_to_allocate - Look up the block map and count the number
  *	of direct blocks need to be allocated for the given branch.
  *
  *	@branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 }
 
 /**
- *	ext3_alloc_blocks: multiple allocate blocks needed for a branch
+ *	ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@goal: preferred place for allocation
  *	@indirect_blks: the number of blocks need to allocate for indirect
  *			blocks
- *
+ *	@blks:	number of blocks need to allocated for direct blocks
  *	@new_blocks: on return it will store the new block numbers for
  *	the indirect blocks(if needed) and the first direct block,
- *	@blks:	on return it will store the total number of allocated
- *		direct blocks
+ *	@err: here we store the error value
+ *
+ *	return the number of direct blocks allocated
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
 
 /**
  *	ext3_alloc_branch - allocate and set up a chain of blocks.
+ *	@handle: handle for this transaction
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
+ *	@goal: preferred place for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
@@ -700,10 +706,9 @@ failed:
 
 /**
  * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
  * @inode: owner
  * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *	ext3_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
-- 
cgit v1.2.3


From 4408ea41c0ab4b711d4da44dd954fb06dce6c3f8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 19 Oct 2010 00:24:21 +0200
Subject: quota: Fix possible oops in __dquot_initialize()

When quotaon(8) races with __dquot_initialize() or dqget() fails because
of EIO, ENOSPC, or similar error, we could possibly dereference NULL pointer
in inode->i_dquot[cnt]. Add proper checking.

Reported-by: Dmitry Monakhov <dmonakhov@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 1bc38f56fa7c..0fed41e6efcd 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1386,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
+		/* We could race with quotaon or dqget() could have failed */
+		if (!got[cnt])
+			continue;
 		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
 			got[cnt] = NULL;
-- 
cgit v1.2.3


From 6b03590412c977ae8fa1635c9b80854ab19a5b78 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 27 Oct 2010 23:19:32 +0200
Subject: cifs: add kfree() on error path

We leak 256 bytes here on this error path.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d7c212a38386..398a15a99a1b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1362,6 +1362,7 @@ static int cifs_writepages(struct address_space *mapping,
 	if (!experimEnabled && tcon->ses->server->secMode &
 			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		cifsFileInfo_put(open_file);
+		kfree(iov);
 		return generic_writepages(mapping, wbc);
 	}
 	cifsFileInfo_put(open_file);
-- 
cgit v1.2.3


From e45c9effed903ba3fdbd6ef0498ee8989c35af0a Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Wed, 27 Oct 2010 15:33:30 -0700
Subject: isofs: work-around for Rock Ridge+Joliet CDs with empty ISO root
 directory

If a CD has both Rock Ridge and Joliet extensions and the ISO root
directory is empty, no files are visible.  Disable Rock Ridge extensions
in this case and use Joliet root directory instead.

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <guenter.roeck@ericsson.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/isofs/inode.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 60c2b944d762..79cf7f616bbe 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -543,6 +543,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
 	return vol_desc_start;
 }
 
+/*
+ * Check if root directory is empty (has less than 3 files).
+ *
+ * Used to detect broken CDs where ISO root directory is empty but Joliet root
+ * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
+ * (and Joliet used instead) or else no files would be visible.
+ */
+static bool rootdir_empty(struct super_block *sb, unsigned long block)
+{
+	int offset = 0, files = 0, de_len;
+	struct iso_directory_record *de;
+	struct buffer_head *bh;
+
+	bh = sb_bread(sb, block);
+	if (!bh)
+		return true;
+	while (files < 3) {
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+		de_len = *(unsigned char *) de;
+		if (de_len == 0)
+			break;
+		files++;
+		offset += de_len;
+	}
+	brelse(bh);
+	return files < 3;
+}
+
 /*
  * Initialize the superblock and read the root inode.
  *
@@ -842,6 +870,18 @@ root_found:
 	if (IS_ERR(inode))
 		goto out_no_root;
 
+	/*
+	 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
+	 * correct Joliet root directory.
+	 */
+	if (sbi->s_rock == 1 && joliet_level &&
+				rootdir_empty(s, sbi->s_firstdatazone)) {
+		printk(KERN_NOTICE
+			"ISOFS: primary root directory is empty. "
+			"Disabling Rock Ridge and switching to Joliet.");
+		sbi->s_rock = 0;
+	}
+
 	/*
 	 * If this disk has both Rock Ridge and Joliet on it, then we
 	 * want to use Rock Ridge by default.  This can be overridden
-- 
cgit v1.2.3


From 9b1bf12d5d51bca178dea21b04a0805e29d60cf1 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 15:34:08 -0700
Subject: signals: move cred_guard_mutex from task_struct to signal_struct

Oleg Nesterov pointed out we have to prevent multiple-threads-inside-exec
itself and we can reuse ->cred_guard_mutex for it.  Yes, concurrent
execve() has no worth.

Let's move ->cred_guard_mutex from task_struct to signal_struct.  It
naturally prevent multiple-threads-inside-exec.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                 | 10 +++++-----
 fs/proc/base.c            |  8 ++++----
 include/linux/init_task.h |  4 ++--
 include/linux/sched.h     |  7 ++++---
 include/linux/tracehook.h |  2 +-
 kernel/cred.c             |  4 +---
 kernel/fork.c             |  2 ++
 kernel/ptrace.c           |  4 ++--
 8 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 3aa75b8888a1..9722909c4d88 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1083,14 +1083,14 @@ EXPORT_SYMBOL(setup_new_exec);
  */
 int prepare_bprm_creds(struct linux_binprm *bprm)
 {
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
 
 	bprm->cred = prepare_exec_creds();
 	if (likely(bprm->cred))
 		return 0;
 
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 	return -ENOMEM;
 }
 
@@ -1098,7 +1098,7 @@ void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
-		mutex_unlock(&current->cred_guard_mutex);
+		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
 	kfree(bprm);
@@ -1119,13 +1119,13 @@ void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold ->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9b094c1c8465..f3d02ca461ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm;
 
-	if (mutex_lock_killable(&task->cred_guard_mutex))
+	if (mutex_lock_killable(&task->signal->cred_guard_mutex))
 		return NULL;
 
 	mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		mmput(mm);
 		mm = NULL;
 	}
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 
 	return mm;
 }
@@ -2354,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 		goto out_free;
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&task->cred_guard_mutex);
+	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 	if (length < 0)
 		goto out_free;
 
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
 				      (void*)page, count);
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
 	free_page((unsigned long) page);
 out:
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2fea6c8ef6ba..1f8c06ce0fa6 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -29,6 +29,8 @@ extern struct fs_struct init_fs;
 		.running = 0,						\
 		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
 	},								\
+	.cred_guard_mutex =						\
+		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
 }
 
 extern struct nsproxy init_nsproxy;
@@ -145,8 +147,6 @@ extern struct cred init_cred;
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
-	.cred_guard_mutex =						\
-		 __MUTEX_INITIALIZER(tsk.cred_guard_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ff5c8519abd..be7adb7588e5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -626,6 +626,10 @@ struct signal_struct {
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
 	int oom_score_adj;	/* OOM kill score adjustment */
+
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
@@ -1305,9 +1309,6 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred __rcu *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	struct mutex cred_guard_mutex;	/* guard against foreign influences on
-					 * credential calculations
-					 * (notably. ptrace) */
 	struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 10db0102a890..3a2e66d88a32 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -150,7 +150,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
  *
  * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
  *
- * @task->cred_guard_mutex is held by the caller through the do_execve().
+ * @task->signal->cred_guard_mutex is held by the caller through the do_execve().
  */
 static inline int tracehook_unsafe_exec(struct task_struct *task)
 {
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
 
 /*
  * Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_guard_mutex
+ * - The caller must hold ->cred_guard_mutex
  */
 struct cred *prepare_exec_creds(void)
 {
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
-	mutex_init(&p->cred_guard_mutex);
-
 	if (
 #ifdef CONFIG_KEYS
 		!p->cred->thread_keyring &&
diff --git a/kernel/fork.c b/kernel/fork.c
index e87aaaaf5131..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
 
+	mutex_init(&sig->cred_guard_mutex);
+
 	return 0;
 }
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ea7ce0215cd1..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
 	 * under ptrace.
 	 */
 	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&task->cred_guard_mutex))
+	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
 		goto out;
 
 	task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
 unlock_tasklist:
 	write_unlock_irq(&tasklist_lock);
 unlock_creds:
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 1b0d300bd0f047e2edaf9d4b6784189e6c67c3d1 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Wed, 27 Oct 2010 15:34:08 -0700
Subject: core_pattern: fix truncation by core_pattern handler with long
 parameters

We met a parameter truncated issue, consider following:
> echo "|/root/core_pattern_pipe_test %p /usr/libexec/blah-blah-blah \
%s %c %p %u %g 11 12345678901234567890123456789012345678 %t" > \
/proc/sys/kernel/core_pattern

This is okay because the strings is less than CORENAME_MAX_SIZE.  "cat
/proc/sys/kernel/core_pattern" shows the whole string.  but after we run
core_pattern_pipe_test in man page, we found last parameter was truncated
like below:

        argc[10]=<12807486>

The root cause is core_pattern allows % specifiers, which need to be
replaced during parse time, but the replace may expand the strings to
larger than CORENAME_MAX_SIZE.  So if the last parameter is % specifiers,
the replace code is using snprintf(out_ptr, out_end - out_ptr, ...), this
will write out of corename array.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 155 ++++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 95 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 9722909c4d88..ca01d2d0a6d4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
+struct core_name {
+	char *corename;
+	int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
 static LIST_HEAD(formats);
@@ -1459,127 +1465,148 @@ void set_binfmt(struct linux_binfmt *new)
 
 EXPORT_SYMBOL(set_binfmt);
 
+static int expand_corename(struct core_name *cn)
+{
+	char *old_corename = cn->corename;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+	if (!cn->corename) {
+		kfree(old_corename);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+	char *cur;
+	int need;
+	int ret;
+	va_list arg;
+
+	va_start(arg, fmt);
+	need = vsnprintf(NULL, 0, fmt, arg);
+	va_end(arg);
+
+	if (likely(need < cn->size - cn->used - 1))
+		goto out_printf;
+
+	ret = expand_corename(cn);
+	if (ret)
+		goto expand_fail;
+
+out_printf:
+	cur = cn->corename + cn->used;
+	va_start(arg, fmt);
+	vsnprintf(cur, need + 1, fmt, arg);
+	va_end(arg);
+	cn->used += need;
+	return 0;
+
+expand_fail:
+	return ret;
+}
+
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(char *corename, long signr)
+static int format_corename(struct core_name *cn, long signr)
 {
 	const struct cred *cred = current_cred();
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
-	char *out_ptr = corename;
-	char *const out_end = corename + CORENAME_MAX_SIZE;
-	int rc;
 	int pid_in_pattern = 0;
+	int err = 0;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+	cn->corename = kmalloc(cn->size, GFP_KERNEL);
+	cn->used = 0;
+
+	if (!cn->corename)
+		return -ENOMEM;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
 		if (*pat_ptr != '%') {
-			if (out_ptr == out_end)
+			if (*pat_ptr == 0)
 				goto out;
-			*out_ptr++ = *pat_ptr++;
+			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
 			switch (*++pat_ptr) {
+			/* single % at the end, drop that */
 			case 0:
 				goto out;
 			/* Double percent, output one percent */
 			case '%':
-				if (out_ptr == out_end)
-					goto out;
-				*out_ptr++ = '%';
+				err = cn_printf(cn, "%c", '%');
 				break;
 			/* pid */
 			case 'p':
 				pid_in_pattern = 1;
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", task_tgid_vnr(current));
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d",
+					      task_tgid_vnr(current));
 				break;
 			/* uid */
 			case 'u':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", cred->uid);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d", cred->uid);
 				break;
 			/* gid */
 			case 'g':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", cred->gid);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d", cred->gid);
 				break;
 			/* signal that caused the coredump */
 			case 's':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%ld", signr);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%ld", signr);
 				break;
 			/* UNIX time of coredump */
 			case 't': {
 				struct timeval tv;
 				do_gettimeofday(&tv);
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%lu", tv.tv_sec);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%lu", tv.tv_sec);
 				break;
 			}
 			/* hostname */
 			case 'h':
 				down_read(&uts_sem);
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%s", utsname()->nodename);
+				err = cn_printf(cn, "%s",
+					      utsname()->nodename);
 				up_read(&uts_sem);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
 				break;
 			/* executable */
 			case 'e':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%s", current->comm);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%s", current->comm);
 				break;
 			/* core limit size */
 			case 'c':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%lu", rlimit(RLIMIT_CORE));
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%lu",
+					      rlimit(RLIMIT_CORE));
 				break;
 			default:
 				break;
 			}
 			++pat_ptr;
 		}
+
+		if (err)
+			return err;
 	}
+
 	/* Backward compatibility with core_uses_pid:
 	 *
 	 * If core_pattern does not include a %p (as is the default)
 	 * and core_uses_pid is set, then .%pid will be appended to
 	 * the filename. Do not do this for piped commands. */
 	if (!ispipe && !pid_in_pattern && core_uses_pid) {
-		rc = snprintf(out_ptr, out_end - out_ptr,
-			      ".%d", task_tgid_vnr(current));
-		if (rc > out_end - out_ptr)
-			goto out;
-		out_ptr += rc;
+		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+		if (err)
+			return err;
 	}
 out:
-	*out_ptr = 0;
 	return ispipe;
 }
 
@@ -1856,7 +1883,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
-	char corename[CORENAME_MAX_SIZE + 1];
+	struct core_name cn;
 	struct mm_struct *mm = current->mm;
 	struct linux_binfmt * binfmt;
 	const struct cred *old_cred;
@@ -1911,7 +1938,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	 */
 	clear_thread_flag(TIF_SIGPENDING);
 
-	ispipe = format_corename(corename, signr);
+	ispipe = format_corename(&cn, signr);
+
+	if (ispipe == -ENOMEM) {
+		printk(KERN_WARNING "format_corename failed\n");
+		printk(KERN_WARNING "Aborting core\n");
+		goto fail_corename;
+	}
 
  	if (ispipe) {
 		int dump_count;
@@ -1948,7 +1981,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
+		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
@@ -1961,7 +1994,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 		argv_free(helper_argv);
 		if (retval) {
  			printk(KERN_INFO "Core dump to %s pipe failed\n",
-			       corename);
+			       cn.corename);
 			goto close_fail;
  		}
 	} else {
@@ -1970,7 +2003,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 		if (cprm.limit < binfmt->min_coredump)
 			goto fail_unlock;
 
-		cprm.file = filp_open(corename,
+		cprm.file = filp_open(cn.corename,
 				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
 				 0600);
 		if (IS_ERR(cprm.file))
@@ -2012,6 +2045,8 @@ fail_dropcount:
 	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
+	kfree(cn.corename);
+fail_corename:
 	coredump_finish(mm);
 	revert_creds(old_cred);
 fail_creds:
-- 
cgit v1.2.3


From 895021552d6ffe8a4d076cb5c4b1e700c33e96e1 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 27 Oct 2010 15:34:09 -0700
Subject: coredump: default CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y

The userland ELF tools have been coping with partial-segments core files
for a few years now.  Multiple distro builds are now setting this option.
It behooves everyone who ever deals with core files to have more info
dumped in there, especially as more and more people's compilers are
producing build IDs.  Make it the default.

Anyone using older tools confused by these core files can configure this
option off, or just change /proc/PID/coredump_filter after boot.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig.binfmt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
 
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
-	default n
+	default y
 	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
 	  inherited.  See Documentation/filesystems/proc.txt for details.
 
 	  This config option changes the default setting of coredump_filter
-	  seen at boot time.  If unsure, say N.
+	  seen at boot time.  If unsure, say Y.
 
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
-- 
cgit v1.2.3


From b40d4f84becd69275451baee7f0801c85eb58437 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Wed, 27 Oct 2010 15:34:10 -0700
Subject: /proc/pid/smaps: export amount of anonymous memory in a mapping

Export the number of anonymous pages in a mapping via smaps.

Even the private pages in a mapping backed by a file, would be marked as
anonymous, when they are modified. Export this information to user-space via
smaps.

Exporting this count will help gdb to make a better decision on which
areas need to be dumped in its coredump; and should be useful to others
studying the memory usage of a process.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 13 ++++++++++---
 fs/proc/task_mmu.c                 |  6 ++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a563b74c7aef..976de6e19dd8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -370,6 +370,7 @@ Shared_Dirty:          0 kB
 Private_Clean:         0 kB
 Private_Dirty:         0 kB
 Referenced:          892 kB
+Anonymous:             0 kB
 Swap:                  0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
@@ -378,9 +379,15 @@ The first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
 (size), the amount of the mapping that is currently resident in RAM (RSS), the
 process' proportional share of this mapping (PSS), the number of clean and
-dirty shared pages in the mapping, and the number of clean and dirty private
-pages in the mapping.  The "Referenced" indicates the amount of memory
-currently marked as referenced or accessed.
+dirty private pages in the mapping.  Note that even a page which is part of a
+MAP_SHARED mapping, but has only a single pte mapped, i.e.  is currently used
+by only one process, is accounted as private and not as shared.  "Referenced"
+indicates the amount of memory currently marked as referenced or accessed.
+"Anonymous" shows the amount of memory that does not belong to any file.  Even
+a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
+and a page is modified, the file page is replaced by a private anonymous copy.
+"Swap" shows how much would-be-anonymous memory is also used, but out on
+swap.
 
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 871e25ed0069..da6b01d70f01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,6 +327,7 @@ struct mem_size_stats {
 	unsigned long private_clean;
 	unsigned long private_dirty;
 	unsigned long referenced;
+	unsigned long anonymous;
 	unsigned long swap;
 	u64 pss;
 };
@@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		if (!page)
 			continue;
 
+		if (PageAnon(page))
+			mss->anonymous += PAGE_SIZE;
+
 		mss->resident += PAGE_SIZE;
 		/* Accumulate the size in pages that have been accessed. */
 		if (pte_young(ptent) || PageReferenced(page))
@@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Clean:  %8lu kB\n"
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
+		   "Anonymous:      %8lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
 		   "MMUPageSize:    %8lu kB\n",
@@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
+		   mss.anonymous >> 10,
 		   mss.swap >> 10,
 		   vma_kernel_pagesize(vma) >> 10,
 		   vma_mmu_pagesize(vma) >> 10);
-- 
cgit v1.2.3


From 19cd56c48da58bebc3a638e036bcab69469acd27 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Wed, 27 Oct 2010 15:34:12 -0700
Subject: procfs: fix /proc/softirqs formatting

The length of the BLOCK_IPOLL string is making i's value be printed too
far to the right.  This patch fixes this and makes the output a bit
neater.

Currently:
                CPU0
      HI:          0
   TIMER:     599792
  NET_TX:          2
  NET_RX:          6
   BLOCK:      80807
BLOCK_IOPOLL:          0
 TASKLET:      20012
   SCHED:          0
 HRTIMER:         63
     RCU:     619279

With patch:
                    CPU0
          HI:          0
       TIMER:     585582
      NET_TX:          2
      NET_RX:          6
       BLOCK:      80320
BLOCK_IOPOLL:          0
     TASKLET:      19287
       SCHED:          0
     HRTIMER:         62
         RCU:     604441

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Acked-by: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/softirqs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..37994737c983 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
 	int i, j;
 
-	seq_printf(p, "                ");
+	seq_printf(p, "                    ");
 	for_each_possible_cpu(i)
 		seq_printf(p, "CPU%-8d", i);
 	seq_printf(p, "\n");
 
 	for (i = 0; i < NR_SOFTIRQS; i++) {
-		seq_printf(p, "%8s:", softirq_to_name[i]);
+		seq_printf(p, "%12s:", softirq_to_name[i]);
 		for_each_possible_cpu(j)
 			seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
 		seq_printf(p, "\n");
-- 
cgit v1.2.3


From f2c66cd8eeddedb440f33bc0f5cec1ed7ae376cb Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 15:34:13 -0700
Subject: /proc/stat: scalability of irq num per cpu

/proc/stat shows the total number of all interrupts to each cpu.  But when
the number of IRQs are very large, it take very long time and 'cat
/proc/stat' takes more than 10 secs.  This is because sum of all irq
events are counted when /proc/stat is read.  This patch adds "sum of all
irq" counter percpu and reduce read costs.

The cost of reading /proc/stat is important because it's used by major
applications as 'top', 'ps', 'w', etc....

A test on a mechin (4096cpu, 256 nodes, 4592 irqs) shows

 %time cat /proc/stat > /dev/null
 Before Patch:  12.627 sec
 After  Patch:  2.459 sec

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Jack Steiner <steiner@sgi.com>
Acked-by: Jack Steiner <steiner@sgi.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/stat.c              |  4 +---
 include/linux/kernel_stat.h | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..b80c620565bf 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -52,9 +52,7 @@ static int show_stat(struct seq_file *p, void *v)
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		guest_nice = cputime64_add(guest_nice,
 			kstat_cpu(i).cpustat.guest_nice);
-		for_each_irq_nr(j) {
-			sum += kstat_irqs_cpu(j, i);
-		}
+		sum += kstat_cpu_irqs_sum(i);
 		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index c059044bc6dc..8b9b89085530 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -33,6 +33,7 @@ struct kernel_stat {
 #ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
 #endif
+	unsigned long irqs_sum;
 	unsigned int softirqs[NR_SOFTIRQS];
 };
 
@@ -54,6 +55,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
 					    struct irq_desc *desc)
 {
 	kstat_this_cpu.irqs[irq]++;
+	kstat_this_cpu.irqs_sum++;
 }
 
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,8 +67,9 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
-#define kstat_incr_irqs_this_cpu(irqno, DESC) \
-	((DESC)->kstat_irqs[smp_processor_id()]++)
+#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\
+	((DESC)->kstat_irqs[smp_processor_id()]++);\
+	kstat_this_cpu.irqs_sum++; } while (0)
 
 #endif
 
@@ -94,6 +97,13 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 	return sum;
 }
 
+/*
+ * Number of interrupts per cpu, since bootup
+ */
+static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
+{
+	return kstat_cpu(cpu).irqs_sum;
+}
 
 /*
  * Lock/unlock the current runqueue - to extract task statistics:
-- 
cgit v1.2.3


From 478735e38887077ac77a9756121b6ce0cb956e2f Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 15:34:15 -0700
Subject: /proc/stat: fix scalability of irq sum of all cpu

In /proc/stat, the number of per-IRQ event is shown by making a sum each
irq's events on all cpus.  But we can make use of kstat_irqs().

kstat_irqs() do the same calculation, If !CONFIG_GENERIC_HARDIRQ,
it's not a big cost. (Both of the number of cpus and irqs are small.)

If a system is very big and CONFIG_GENERIC_HARDIRQ, it does

	for_each_irq()
		for_each_cpu()
			- look up a radix tree
			- read desc->irq_stat[cpu]
This seems not efficient. This patch adds kstat_irqs() for
CONFIG_GENRIC_HARDIRQ and change the calculation as

	for_each_irq()
		look up radix tree
		for_each_cpu()
			- read desc->irq_stat[cpu]

This reduces cost.

A test on (4096cpusp, 256 nodes, 4592 irqs) host (by Jack Steiner)

%time cat /proc/stat > /dev/null

Before Patch:	 2.459 sec
After Patch :	  .561 sec

[akpm@linux-foundation.org: unexport kstat_irqs, coding-style tweaks]
[akpm@linux-foundation.org: fix unused variable 'per_irq_sum']
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Jack Steiner <steiner@sgi.com>
Acked-by: Jack Steiner <steiner@sgi.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/stat.c              | 10 ++--------
 include/linux/kernel_stat.h |  4 ++++
 kernel/irq/irqdesc.c        | 15 +++++++++++++++
 3 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b80c620565bf..e15a19c93bae 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
-	unsigned int per_irq_sum;
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -108,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
-		per_irq_sum = 0;
-		for_each_possible_cpu(i)
-			per_irq_sum += kstat_irqs_cpu(j, i);
-
-		seq_printf(p, " %u", per_irq_sum);
-	}
+	for_each_irq_nr(j)
+		seq_printf(p, " %u", kstat_irqs(j));
 
 	seq_printf(p,
 		"\nctxt %llu\n"
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 8b9b89085530..ad54c846911b 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -86,6 +86,7 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
+#ifndef CONFIG_GENERIC_HARDIRQS
 static inline unsigned int kstat_irqs(unsigned int irq)
 {
 	unsigned int sum = 0;
@@ -96,6 +97,9 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 
 	return sum;
 }
+#else
+extern unsigned int kstat_irqs(unsigned int irq);
+#endif
 
 /*
  * Number of interrupts per cpu, since bootup
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 	struct irq_desc *desc = irq_to_desc(irq);
 	return desc ? desc->kstat_irqs[cpu] : 0;
 }
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+unsigned int kstat_irqs(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	int cpu;
+	int sum = 0;
+
+	if (!desc)
+		return 0;
+	for_each_possible_cpu(cpu)
+		sum += desc->kstat_irqs[cpu];
+	return sum;
+}
+#endif /* CONFIG_GENERIC_HARDIRQS */
-- 
cgit v1.2.3


From 98391cf4dcf893e9e74e1c14189851dbc9c5ad0d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 15:34:16 -0700
Subject: exec: don't turn PF_KTHREAD off when a target command was not found

Presently do_execve() turns PF_KTHREAD off before search_binary_handler().
 THis has a theorical risk of PF_KTHREAD getting lost.  We don't have to
turn PF_KTHREAD off in the ENOEXEC case.

This patch moves this flag modification to after the finding of the
executable file.

This is only a theorical issue because kthreads do not call do_execve()
directly.  But fixing would be better.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index ca01d2d0a6d4..99d33a1371e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1009,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	bprm->mm = NULL;		/* We're using it now */
 
-	current->flags &= ~PF_RANDOMIZE;
+	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
 	flush_thread();
 	current->personality &= ~bprm->per_clear;
 
@@ -1412,7 +1412,6 @@ int do_execve(const char * filename,
 	if (retval < 0)
 		goto out;
 
-	current->flags &= ~PF_KTHREAD;
 	retval = search_binary_handler(bprm,regs);
 	if (retval < 0)
 		goto out;
-- 
cgit v1.2.3


From 0be8557bcd34887d5a42c01c5659cab5ecf99f13 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 27 Oct 2010 15:34:46 -0700
Subject: fuse: use release_pages()

Replace iterated page_cache_release() with release_pages(), which is
faster and shorter.

Needs release_pages() to be exported to modules.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c | 7 +------
 mm/swap.c     | 1 +
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b98664275f02..6e07696308dc 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1334,12 +1334,7 @@ out_finish:
 
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	int i;
-
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
-		page_cache_release(page);
-	}
+	release_pages(req->pages, req->num_pages, 0);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 	pagevec_free(&pages_to_free);
 }
+EXPORT_SYMBOL(release_pages);
 
 /*
  * The pages which we're about to release may be in the deferred lru-addition
-- 
cgit v1.2.3


From 231f3d393f63f6e3b505afa179999bba491d0f08 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 27 Oct 2010 15:34:53 -0700
Subject: select: rename estimate_accuracy() to select_estimate_accuracy()

Make it a subsystem-specific identifier because we wish to amke it
non-static in the next patch ("epoll: make epoll_wait() use the hrtimer
range feature").

Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..5f023f911202 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-static long estimate_accuracy(struct timespec *tv)
+static long select_estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	retval = 0;
 	for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	for (;;) {
 		struct poll_list *walk;
-- 
cgit v1.2.3


From 95aac7b1cd224f568fb83937044cd303ff11b029 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <shawn.bohrer@gmail.com>
Date: Wed, 27 Oct 2010 15:34:54 -0700
Subject: epoll: make epoll_wait() use the hrtimer range feature

This make epoll use hrtimers for the timeout value which prevents
epoll_wait() from timing out up to a millisecond early.

This mirrors the behavior of select() and poll().

Signed-off-by: Shawn Bohrer <shawn.bohrer@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c       | 35 +++++++++++++++++++----------------
 fs/select.c          |  2 +-
 include/linux/poll.h |  2 ++
 3 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 256bb7bb102a..8cf07242067d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
 
-/* Maximum msec timeout value storeable in a long int */
-#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
-
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
 #define EP_UNACTIVE_PTR ((void *) -1L)
@@ -1117,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
-	int res, eavail;
+	int res, eavail, timed_out = 0;
 	unsigned long flags;
-	long jtimeout;
+	long slack;
 	wait_queue_t wait;
-
-	/*
-	 * Calculate the timeout by checking for the "infinite" value (-1)
-	 * and the overflow condition. The passed timeout is in milliseconds,
-	 * that why (t * HZ) / 1000.
-	 */
-	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
-		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
+	struct timespec end_time;
+	ktime_t expires, *to = NULL;
+
+	if (timeout > 0) {
+		ktime_get_ts(&end_time);
+		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		slack = select_estimate_accuracy(&end_time);
+		to = &expires;
+		*to = timespec_to_ktime(end_time);
+	} else if (timeout == 0) {
+		timed_out = 1;
+	}
 
 retry:
 	spin_lock_irqsave(&ep->lock, flags);
@@ -1150,7 +1151,7 @@ retry:
 			 * to TASK_INTERRUPTIBLE before doing the checks.
 			 */
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&ep->rdllist) || !jtimeout)
+			if (!list_empty(&ep->rdllist) || timed_out)
 				break;
 			if (signal_pending(current)) {
 				res = -EINTR;
@@ -1158,7 +1159,9 @@ retry:
 			}
 
 			spin_unlock_irqrestore(&ep->lock, flags);
-			jtimeout = schedule_timeout(jtimeout);
+			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+				timed_out = 1;
+
 			spin_lock_irqsave(&ep->lock, flags);
 		}
 		__remove_wait_queue(&ep->wq, &wait);
@@ -1176,7 +1179,7 @@ retry:
 	 * more luck.
 	 */
 	if (!res && eavail &&
-	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
+	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto retry;
 
 	return res;
diff --git a/fs/select.c b/fs/select.c
index 5f023f911202..b7b10aa30861 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-static long select_estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 600cc1fde64d..56e76af78102 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -73,6 +73,8 @@ extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
 extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 				 ktime_t *expires, unsigned long slack);
+extern long select_estimate_accuracy(struct timespec *tv);
+
 
 static inline int poll_schedule(struct poll_wqueues *pwq, int state)
 {
-- 
cgit v1.2.3


From 55f335a8857db2ee22c068e7ab7141fc79928296 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 27 Oct 2010 18:17:02 -0700
Subject: fasync: Fix placement of FASYNC flag comment

In commit f7347ce4ee7c ("fasync: re-organize fasync entry insertion to
allow it under a spinlock") Arnd took an earlier patch of mine that had
the comment about the FASYNC flag above the wrong function.

When the fasync_add_entry() function was split to introduce the new
fasync_insert_entry() helper function, the code that actually cares
about the FASYNC bit moved to that new helper.

So just move the comment to the right point.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fcntl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index dcdbc6f5c33b..ecc8b3954ed6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -684,6 +684,9 @@ void fasync_free(struct fasync_struct *new)
 /*
  * Insert a new entry into the fasync list.  Return the pointer to the
  * old one if we didn't use the new one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
  */
 struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
 {
@@ -718,9 +721,6 @@ out:
 /*
  * Add a fasync entry. Return negative on error, positive if
  * added, and zero if did nothing but change an existing one.
- *
- * NOTE! It is very important that the FASYNC flag always
- * match the state "is the filp on a fasync list".
  */
 static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
 {
-- 
cgit v1.2.3


From 58590b06d79f7ce5ab64ff3b6d537180fa50dc84 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:23:12 -0400
Subject: ext4: fix EOFBLOCKS_FL handling

It turns out we have several problems with how EOFBLOCKS_FL is
handled.  First of all, there was a fencepost error where we were not
clearing the EOFBLOCKS_FL when fill in the last uninitialized block,
but rather when we allocate the next block _after_ the uninitalized
block.  Secondly we were not testing to see if we needed to clear the
EOFBLOCKS_FL when writing to the file O_DIRECT or when were converting
an uninitialized block (which is the most common case).

Google-Bug-Id: 2928259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 98 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..820278410220 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3180,6 +3180,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                 unmap_underlying_metadata(bdev, block + i);
 }
 
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      struct ext4_map_blocks *map,
+			      struct ext4_ext_path *path,
+			      unsigned int len)
+{
+	int i, depth;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex, *last_ex;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	if (unlikely(!eh->eh_entries)) {
+		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+				 "EOFBLOCKS_FL set");
+		return -EIO;
+	}
+	last_ex = EXT_LAST_EXTENT(eh);
+	/*
+	 * We should clear the EOFBLOCKS_FL flag if we are writing the
+	 * last block in the last extent in the file.  We test this by
+	 * first checking to see if the caller to
+	 * ext4_ext_get_blocks() was interested in the last block (or
+	 * a block beyond the last block) in the current extent.  If
+	 * this turns out to be false, we can bail out from this
+	 * function immediately.
+	 */
+	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+	    ext4_ext_get_actual_len(last_ex))
+		return 0;
+	/*
+	 * If the caller does appear to be planning to write at or
+	 * beyond the end of the current extent, we then test to see
+	 * if the current extent is the last extent in the file, by
+	 * checking to make sure it was reached via the rightmost node
+	 * at each level of the tree.
+	 */
+	for (i = depth-1; i >= 0; i--)
+		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+			return 0;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
@@ -3217,8 +3268,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
-		if (ret >= 0)
+		if (ret >= 0) {
 			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map, path,
+						 map->m_len);
+		} else
+			err = ret;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3244,8 +3299,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* buffered write, writepage time, convert*/
 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-	if (ret >= 0)
+	if (ret >= 0) {
 		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+		if (err < 0)
+			goto out2;
+	}
+
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3292,6 +3352,7 @@ out2:
 	}
 	return err ? err : allocated;
 }
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3315,9 +3376,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex, *last_ex;
+	struct ext4_extent newex, *ex;
 	ext4_fsblk_t newblock;
-	int i, err = 0, depth, ret, cache_type;
+	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3497,31 +3558,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-		if (unlikely(!eh->eh_entries)) {
-			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 and "
-					 "EOFBLOCKS_FL set");
-			err = -EIO;
-			goto out2;
-		}
-		last_ex = EXT_LAST_EXTENT(eh);
-		/*
-		 * If the current leaf block was reached by looking at
-		 * the last index block all the way down the tree, and
-		 * we are extending the inode beyond the last extent
-		 * in the current leaf block, then clear the
-		 * EOFBLOCKS_FL flag.
-		 */
-		for (i = depth-1; i >= 0; i--) {
-			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-				break;
-		}
-		if ((i < 0) &&
-		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-		     ext4_ext_get_actual_len(last_ex)))
-			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-	}
+	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+	if (err)
+		goto out2;
+
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
-- 
cgit v1.2.3


From 39e3ac2599a5f9aba499b5f8af809108e70a6163 Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 27 Oct 2010 21:25:12 -0400
Subject: jbd2: Fix I/O hang in jbd2_journal_release_jbd_inode

This fixes a hang seen in jbd2_journal_release_jbd_inode
on a lot of Power 6 systems running with ext4. When we get
in the hung state, all I/O to the disk in question gets blocked
where we stay indefinitely. Looking at the task list, I can see
we are stuck in jbd2_journal_release_jbd_inode waiting on a
wake up. I added some debug code to detect this scenario and
dump additional data if we were stuck in jbd2_journal_release_jbd_inode
for longer than 30 minutes. When it hit, I was able to see that
i_flags was 0, suggesting we missed the wake up.

This patch changes i_flags to be an unsigned long, uses bit operators
to access it, and adds barriers around the accesses. Prior to applying
this patch, we were regularly hitting this hang on numerous systems
in our test environment. After applying the patch, the hangs no longer
occur.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c     | 12 ++++++++----
 fs/jbd2/journal.c    |  4 +++-
 include/linux/jbd2.h |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..6494c81e3b0a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/bitops.h>
 #include <trace/events/jbd2.h>
+#include <asm/system.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -236,7 +238,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 		mapping = jinode->i_vfs_inode->i_mapping;
-		jinode->i_flags |= JI_COMMIT_RUNNING;
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
 		/*
 		 * submit the inode data buffers. We use writepage
@@ -251,7 +253,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
 		commit_transaction->t_flushed_data_blocks = 1;
-		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -272,7 +275,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-		jinode->i_flags |= JI_COMMIT_RUNNING;
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 		if (err) {
@@ -288,7 +291,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 				ret = err;
 		}
 		spin_lock(&journal->j_list_lock);
-		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..75e1b5a0bc2d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,14 @@
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
+#include <linux/bitops.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/system.h>
 
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
@@ -2206,7 +2208,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
 restart:
 	spin_lock(&journal->j_list_lock);
 	/* Is commit writing out inode - we have to wait */
-	if (jinode->i_flags & JI_COMMIT_RUNNING) {
+	if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
 		wait_queue_head_t *wq;
 		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 0b52924a0cb6..2ae86aa21fce 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -395,7 +395,7 @@ struct jbd2_inode {
 	struct inode *i_vfs_inode;
 
 	/* Flags of inode [j_list_lock] */
-	unsigned int i_flags;
+	unsigned long i_flags;
 };
 
 struct jbd2_revoke_table_s;
-- 
cgit v1.2.3


From fb1813f4a8a27bbd4735967e46931e61fc837a3e Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Wed, 27 Oct 2010 21:29:12 -0400
Subject: ext4: use dedicated slab caches for group_info structures

ext4_group_info structures are currently allocated with kmalloc().
With a typical 4K block size, these are 136 bytes each -- meaning
they'll each consume a 256-byte slab object.  On a system with many
ext4 large partitions, that's a lot of wasted kernel slab space.
(E.g., a single 1TB partition will have about 8000 block groups, using
about 2MB of slab, of which nearly 1MB is wasted.)

This patch creates an array of slab pointers created as needed --
depending on the superblock block size -- and uses these slabs to
allocate the group info objects.

Google-Bug-Id: 2980809

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/mballoc.c | 98 +++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 78 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..b364b9df09b3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -205,6 +205,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define	EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..d4714d6cf7d9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES	\
+	(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2233,15 +2241,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
 	.release	= seq_release,
 };
 
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+	BUG_ON(!cachep);
+	return cachep;
+}
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
-	int i, len;
+	int i;
 	int metalen = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	/*
 	 * First check if this group is the first of a reserved block.
@@ -2261,22 +2278,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			meta_group_info;
 	}
 
-	/*
-	 * calculate needed size. if change bb_counters size,
-	 * don't forget about ext4_mb_generate_buddy()
-	 */
-	len = offsetof(typeof(**meta_group_info),
-		       bb_counters[sb->s_blocksize_bits + 2]);
-
 	meta_group_info =
 		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (meta_group_info[i] == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
 		goto exit_group_info;
 	}
+	memset(meta_group_info[i], 0, kmem_cache_size(cachep));
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
 		&(meta_group_info[i]->bb_state));
 
@@ -2331,6 +2342,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	int num_meta_group_infos_max;
 	int array_size;
 	struct ext4_group_desc *desc;
+	struct kmem_cache *cachep;
 
 	/* This is the number of blocks used by GDT */
 	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2388,8 +2400,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	return 0;
 
 err_freebuddy:
+	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	while (i-- > 0)
-		kfree(ext4_get_group_info(sb, i));
+		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
@@ -2406,19 +2419,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned offset;
 	unsigned max;
 	int ret;
+	int cache_index;
+	struct kmem_cache *cachep;
+	char *namep = NULL;
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_offsets);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	cachep = ext4_groupinfo_caches[cache_index];
+	if (!cachep) {
+		char name[32];
+		int len = offsetof(struct ext4_group_info,
+					bb_counters[sb->s_blocksize_bits + 2]);
+
+		sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+		namep = kstrdup(name, GFP_KERNEL);
+		if (!namep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* Need to free the kmem_cache_name() when we
+		 * destroy the slab */
+		cachep = kmem_cache_create(namep, len, 0,
+					     SLAB_RECLAIM_ACCOUNT, NULL);
+		if (!cachep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ext4_groupinfo_caches[cache_index] = cachep;
 	}
 
 	/* order 0 is regular bitmap */
@@ -2439,9 +2481,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return ret;
+		goto out;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2496,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
@@ -2475,7 +2514,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-	return 0;
+out:
+	if (ret) {
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		kfree(namep);
+	}
+	return ret;
 }
 
 /* need to called with the ext4 group lock held */
@@ -2503,6 +2548,7 @@ int ext4_mb_release(struct super_block *sb)
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2559,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
-			kfree(grinfo);
+			kmem_cache_free(cachep, grinfo);
 		}
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2691,6 +2737,7 @@ int __init init_ext4_mballoc(void)
 
 void exit_ext4_mballoc(void)
 {
+	int i;
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
@@ -2699,6 +2746,15 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+		if (cachep) {
+			char *name = (char *)kmem_cache_name(cachep);
+			kmem_cache_destroy(cachep);
+			kfree(name);
+		}
+	}
 	ext4_remove_debugfs_entry();
 }
 
-- 
cgit v1.2.3


From 659c6009ca2e3a01acc9881bafe5f55ef09c965b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:03 -0400
Subject: ext4: stop looping in ext4_num_dirty_pages when max_pages reached

Today we simply break out of the inner loop when we have accumulated
max_pages; this keeps scanning forwad and doing pagevec_lookup_tag()
in the while (!done) loop, this does potentially a lot of work
with no net effect.

When we have accumulated max_pages, just clean up and return.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..d88ba4a9effa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1207,8 +1207,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 				break;
 			idx++;
 			num++;
-			if (num >= max_pages)
+			if (num >= max_pages) {
+				done = 1;
 				break;
+			}
 		}
 		pagevec_release(&pvec);
 	}
-- 
cgit v1.2.3


From b443e7339aa08574d30b0819b344618459c76214 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:03 -0400
Subject: ext4: don't bump up LONG_MAX nr_to_write by a factor of 8

I'm uneasy with lots of stuff going on in ext4_da_writepages(),
but bumping nr_to_write from LLONG_MAX to -8 clearly isn't
making anything better, so avoid the multiplier in that case.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d88ba4a9effa..50f3bba68a25 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3004,9 +3004,12 @@ static int ext4_da_writepages(struct address_space *mapping,
 	 * sbi->max_writeback_mb_bump whichever is smaller.
 	 */
 	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-	if (!range_cyclic && range_whole)
-		desired_nr_to_write = wbc->nr_to_write * 8;
-	else
+	if (!range_cyclic && range_whole) {
+		if (wbc->nr_to_write == LONG_MAX)
+			desired_nr_to_write = wbc->nr_to_write;
+		else
+			desired_nr_to_write = wbc->nr_to_write * 8;
+	} else
 		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
 							   max_pages);
 	if (desired_nr_to_write > max_pages)
-- 
cgit v1.2.3


From 582987098207f1182ed5c7d01d5fedf7a5f56286 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:03 -0400
Subject: ext4: check for negative error code from sb_issue_discard

sb_issue_discard() is returning negative error code, so check for
-EOPNOTSUPP.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d4714d6cf7d9..53472e27b327 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2613,7 +2613,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
 	ret = sb_issue_discard(sb, discard_block, count);
-	if (ret == EOPNOTSUPP) {
+	if (ret == -EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
 	}
-- 
cgit v1.2.3


From 53fdcf992d616484d388a8ab9dad07dc8b8f1178 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:04 -0400
Subject: ext4: don't hold spinlock while calling ext4_issue_discard()

We can't hold the block group spinlock because we ext4_issue_discard()
calls wait and hence can get rescheduled.

Google-Bug-Id: 3017678

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 53472e27b327..ccdfec6acb75 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4696,12 +4696,12 @@ do_more:
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
+		if (test_opt(sb, DISCARD))
+			ext4_issue_discard(sb, block_group, bit, count);
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, block_group, bit, count);
 	}
 
 	ret = ext4_free_blks_count(sb, gdp) + count;
-- 
cgit v1.2.3


From a1c6c5698d53db4c47a25c3a8d11731a4d7b8370 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Wed, 27 Oct 2010 21:30:04 -0400
Subject: ext4: fix NULL pointer dereference in print_daily_error_info()

Fix NULL pointer dereference in print_daily_error_info, when
called on unmounted fs (EXT4_SB(sb) returns NULL), by removing error
reporting timer in ext4_put_super.

Google-Bug-Id: 3017663

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..751997d2cefe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -719,6 +719,7 @@ static void ext4_put_super(struct super_block *sb)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
-- 
cgit v1.2.3


From 5c2178e785244341d1e6f2bc3b62f20a337cc44f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:04 -0400
Subject: jbd2: Add sanity check for attempts to start handle during umount

An attempt to modify the file system during the call to
jbd2_destroy_journal() can lead to a system lockup.  So add some
checking to make it much more obvious when this happens to and to
determine where the offending code is located.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c  | 10 ++++++++++
 fs/jbd2/transaction.c |  1 +
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..524800dce207 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		transaction->t_chp_stats.cs_forced_to_close++;
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
+		if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+			/*
+			 * The journal thread is dead; so starting and
+			 * waiting for a commit to finish will cause
+			 * us to wait for a _very_ long time.
+			 */
+			printk(KERN_ERR "JBD2: %s: "
+			       "Waiting for Godot: block %llu\n",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr);
 		jbd2_log_start_commit(journal, tid);
 		jbd2_log_wait_commit(journal, tid);
 		ret = 1;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..6bf0a242613e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
 	 */
 repeat:
 	read_lock(&journal->j_state_lock);
+	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
 		read_unlock(&journal->j_state_lock);
-- 
cgit v1.2.3


From bfff68738f1cb5c93dab1114634cea02aae9e7ba Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:05 -0400
Subject: ext4: add support for lazy inode table initialization

When the lazy_itable_init extended option is passed to mke2fs, it
considerably speeds up filesystem creation because inode tables are
not zeroed out.  The fact that parts of the inode table are
uninitialized is not a problem so long as the block group descriptors,
which contain information regarding how much of the inode table has
been initialized, has not been corrupted However, if the block group
checksums are not valid, e2fsck must scan the entire inode table, and
the the old, uninitialized data could potentially cause e2fsck to
report false problems.

Hence, it is important for the inode tables to be initialized as soon
as possble.  This commit adds this feature so that mke2fs can safely
use the lazy inode table initialization feature to speed up formatting
file systems.

This is done via a new new kernel thread called ext4lazyinit, which is
created on demand and destroyed, when it is no longer needed.  There
is only one thread for all ext4 filesystems in the system. When the
first filesystem with inititable mount option is mounted, ext4lazyinit
thread is created, then the filesystem can register its request in the
request list.

This thread then walks through the list of requests picking up
scheduled requests and invoking ext4_init_inode_table(). Next schedule
time for the request is computed by multiplying the time it took to
zero out last inode table with wait multiplier, which can be set with
the (init_itable=n) mount option (default is 10).  We are doing
this so we do not take the whole I/O bandwidth. When the thread is no
longer necessary (request list is empty) it frees the appropriate
structures and exits (and can be created later later by another
filesystem).

We do not disturb regular inode allocations in any way, it just do not
care whether the inode table is, or is not zeroed. But when zeroing, we
have to skip used inodes, obviously. Also we should prevent new inode
allocations from the group, while zeroing is on the way. For that we
take write alloc_sem lock in ext4_init_inode_table() and read alloc_sem
in the ext4_claim_inode, so when we are unlucky and allocator hits the
group which is currently being zeroed, it just has to wait.

This can be suppresed using the mount option no_init_itable.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt |  14 ++
 fs/ext4/ext4.h                     |  40 ++++
 fs/ext4/ialloc.c                   | 120 ++++++++++
 fs/ext4/super.c                    | 440 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 611 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index e1def1786e50..6ab9442d7eeb 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -353,6 +353,20 @@ noauto_da_alloc		replacing existing files via patterns such as
 			system crashes before the delayed allocation
 			blocks are forced to disk.
 
+noinit_itable		Do not initialize any uninitialized inode table
+			blocks in the background.  This feature may be
+			used by installation CD's so that the install
+			process can complete as quickly as possible; the
+			inode table initialization process would then be
+			deferred until the next time the  file system
+			is unmounted.
+
+init_itable=n		The lazy itable init code will wait n times the
+			number of milliseconds it took to zero out the
+			previous block group's inode table.  This
+			minimizes the impact on the systme performance
+			while file system's inode table is being initialized.
+
 discard		Controls whether ext4 should issue discard/TRIM
 nodiscard(*)		commands to the underlying block device when
 			blocks are freed.  This is useful for SSD devices
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b364b9df09b3..0fe078d368d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -890,6 +890,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
@@ -1173,6 +1174,11 @@ struct ext4_sb_info {
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1536,6 +1542,38 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT			10
+#define EXT4_DEF_LI_MAX_START_DELAY		5
+#define EXT4_LAZYINIT_QUIT			0x0001
+#define EXT4_LAZYINIT_RUNNING			0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+
+	wait_queue_head_t	li_wait_daemon;
+	wait_queue_head_t	li_wait_task;
+	struct timer_list	li_timer;
+	struct task_struct	*li_task;
+
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
 /*
  * Function prototypes
  */
@@ -1611,6 +1649,8 @@ extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
 				       ext4_group_t group,
 				       struct ext4_group_desc *desc);
 extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..e428f23215c0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -107,6 +107,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		return NULL;
+
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
@@ -123,6 +124,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
+
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +135,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	ext4_unlock_group(sb, block_group);
+
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -712,8 +715,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
 	int free = 0, retval = 0, count;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
+	/*
+	 * We have to be sure that new inode allocation does not race with
+	 * inode table initialization, because otherwise we may end up
+	 * allocating and writing new inode right before sb_issue_zeroout
+	 * takes place and overwriting our new inode with zeroes. So we
+	 * take alloc_sem to prevent it.
+	 */
+	down_read(&grp->alloc_sem);
 	ext4_lock_group(sb, group);
 	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
 		/* not a free inode */
@@ -724,6 +736,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_unlock_group(sb, group);
+		up_read(&grp->alloc_sem);
 		ext4_error(sb, "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +785,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
 	ext4_unlock_group(sb, group);
+	up_read(&grp->alloc_sem);
 	return retval;
 }
 
@@ -1205,3 +1219,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
 	}
 	return count;
 }
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+				 int barrier)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	struct buffer_head *group_desc_bh;
+	handle_t *handle;
+	ext4_fsblk_t blk;
+	int num, ret = 0, used_blks = 0;
+	unsigned long flags = BLKDEV_IFL_WAIT;
+
+	/* This should not happen, but just to be sure check this */
+	if (sb->s_flags & MS_RDONLY) {
+		ret = 1;
+		goto out;
+	}
+
+	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+	if (!gdp)
+		goto out;
+
+	/*
+	 * We do not need to lock this, because we are the only one
+	 * handling this flag.
+	 */
+	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+		goto out;
+
+	handle = ext4_journal_start_sb(sb, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	down_write(&grp->alloc_sem);
+	/*
+	 * If inode bitmap was already initialized there may be some
+	 * used inodes so we need to skip blocks with used inodes in
+	 * inode table.
+	 */
+	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+			    ext4_itable_unused_count(sb, gdp)),
+			    sbi->s_inodes_per_block);
+
+	blk = ext4_inode_table(sb, gdp) + used_blks;
+	num = sbi->s_itb_per_group - used_blks;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	ret = ext4_journal_get_write_access(handle,
+					    group_desc_bh);
+	if (ret)
+		goto err_out;
+
+	if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) {
+		ext4_error(sb, "Something is wrong with group %u\n"
+			   "Used itable blocks: %d"
+			   "Itable blocks per group: %lu\n",
+			   group, used_blks, sbi->s_itb_per_group);
+		ret = 1;
+		goto err_out;
+	}
+
+	/*
+	 * Skip zeroout if the inode table is full. But we set the ZEROED
+	 * flag anyway, because obviously, when it is full it does not need
+	 * further zeroing.
+	 */
+	if (unlikely(num == 0))
+		goto skip_zeroout;
+
+	ext4_debug("going to zero out inode table in group %d\n",
+		   group);
+	if (barrier)
+		flags |= BLKDEV_IFL_BARRIER;
+	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
+	if (ret < 0)
+		goto err_out;
+
+skip_zeroout:
+	ext4_lock_group(sb, group);
+	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+	ext4_unlock_group(sb, group);
+
+	BUFFER_TRACE(group_desc_bh,
+		     "call ext4_handle_dirty_metadata");
+	ret = ext4_handle_dirty_metadata(handle, NULL,
+					 group_desc_bh);
+
+err_out:
+	up_write(&grp->alloc_sem);
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 751997d2cefe..5066537e5a38 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -41,6 +41,9 @@
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
 
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -52,6 +55,8 @@
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
+struct ext4_lazy_init *ext4_li_info;
+struct mutex ext4_li_mtx;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -70,6 +75,8 @@ static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
 static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data, struct vfsmount *mnt);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
@@ -720,6 +727,7 @@ static void ext4_put_super(struct super_block *sb)
 	}
 
 	del_timer(&sbi->s_err_report);
+	ext4_unregister_li_request(sb);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -1046,6 +1054,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
 		seq_puts(seq, ",block_validity");
 
+	if (!test_opt(sb, INIT_INODE_TABLE))
+		seq_puts(seq, ",noinit_inode_table");
+	else if (sbi->s_li_wait_mult)
+		seq_printf(seq, ",init_inode_table=%u",
+			   (unsigned) sbi->s_li_wait_mult);
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1220,6 +1234,7 @@ enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
+	Opt_init_inode_table, Opt_noinit_inode_table,
 };
 
 static const match_table_t tokens = {
@@ -1290,6 +1305,9 @@ static const match_table_t tokens = {
 	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_init_inode_table, "init_itable=%u"},
+	{Opt_init_inode_table, "init_itable"},
+	{Opt_noinit_inode_table, "noinit_itable"},
 	{Opt_err, NULL},
 };
 
@@ -1760,6 +1778,20 @@ set_qf_format:
 		case Opt_dioread_lock:
 			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
 			break;
+		case Opt_init_inode_table:
+			set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = EXT4_DEF_LI_WAIT_MULT;
+			if (option < 0)
+				return 0;
+			sbi->s_li_wait_mult = option;
+			break;
+		case Opt_noinit_inode_table:
+			clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1943,7 +1975,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+				  ext4_group_t *first_not_zeroed)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1952,7 +1985,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	ext4_fsblk_t inode_bitmap;
 	ext4_fsblk_t inode_table;
 	int flexbg_flag = 0;
-	ext4_group_t i;
+	ext4_group_t i, grp = sbi->s_groups_count;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
@@ -1968,6 +2001,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			last_block = first_block +
 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
 
+		if ((grp == sbi->s_groups_count) &&
+		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			grp = i;
+
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2005,6 +2042,8 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
+	if (NULL != first_not_zeroed)
+		*first_not_zeroed = grp;
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
 	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2543,6 +2582,378 @@ static void print_daily_error_info(unsigned long arg)
 	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+	struct task_struct *p = (struct task_struct *)data;
+	wake_up_process(p);
+}
+
+/* Find next suitable group and run ext4_init_inode_table */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t group, ngroups;
+	struct super_block *sb;
+	unsigned long timeout = 0;
+	int ret = 0;
+
+	sb = elr->lr_super;
+	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			ret = 1;
+			break;
+		}
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	if (group == ngroups)
+		ret = 1;
+
+	if (!ret) {
+		timeout = jiffies;
+		ret = ext4_init_inode_table(sb, group,
+					    elr->lr_timeout ? 0 : 1);
+		if (elr->lr_timeout == 0) {
+			timeout = jiffies - timeout;
+			if (elr->lr_sbi->s_li_wait_mult)
+				timeout *= elr->lr_sbi->s_li_wait_mult;
+			else
+				timeout *= 20;
+			elr->lr_timeout = timeout;
+		}
+		elr->lr_next_sched = jiffies + elr->lr_timeout;
+		elr->lr_next_group = group + 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request tructure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_sb_info *sbi;
+
+	if (!elr)
+		return;
+
+	sbi = elr->lr_sbi;
+
+	list_del(&elr->lr_request);
+	sbi->s_li_request = NULL;
+	kfree(elr);
+}
+
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
+
+	if (!ext4_li_info)
+		return;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	ext4_remove_li_request(elr);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_rn_li_request) and keep track of the time spend in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+	unsigned long next_wakeup;
+	DEFINE_WAIT(wait);
+	int ret;
+
+	BUG_ON(NULL == eli);
+
+	eli->li_timer.data = (unsigned long)current;
+	eli->li_timer.function = ext4_lazyinode_timeout;
+
+	eli->li_task = current;
+	wake_up(&eli->li_wait_task);
+
+cont_thread:
+	while (true) {
+		next_wakeup = MAX_JIFFY_OFFSET;
+
+		mutex_lock(&eli->li_list_mtx);
+		if (list_empty(&eli->li_request_list)) {
+			mutex_unlock(&eli->li_list_mtx);
+			goto exit_thread;
+		}
+
+		list_for_each_safe(pos, n, &eli->li_request_list) {
+			elr = list_entry(pos, struct ext4_li_request,
+					 lr_request);
+
+			if (time_after_eq(jiffies, elr->lr_next_sched))
+				ret = ext4_run_li_request(elr);
+
+			if (ret) {
+				ret = 0;
+				ext4_remove_li_request(elr);
+				continue;
+			}
+
+			if (time_before(elr->lr_next_sched, next_wakeup))
+				next_wakeup = elr->lr_next_sched;
+		}
+		mutex_unlock(&eli->li_list_mtx);
+
+		if (freezing(current))
+			refrigerator();
+
+		if (time_after_eq(jiffies, next_wakeup)) {
+			cond_resched();
+			continue;
+		}
+
+		eli->li_timer.expires = next_wakeup;
+		add_timer(&eli->li_timer);
+		prepare_to_wait(&eli->li_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+		if (time_before(jiffies, next_wakeup))
+			schedule();
+		finish_wait(&eli->li_wait_daemon, &wait);
+	}
+
+exit_thread:
+	/*
+	 * It looks like the request list is empty, but we need
+	 * to check it under the li_list_mtx lock, to prevent any
+	 * additions into it, and of course we should lock ext4_li_mtx
+	 * to atomically free the list and ext4_li_info, because at
+	 * this point another ext4 filesystem could be registering
+	 * new one.
+	 */
+	mutex_lock(&ext4_li_mtx);
+	mutex_lock(&eli->li_list_mtx);
+	if (!list_empty(&eli->li_request_list)) {
+		mutex_unlock(&eli->li_list_mtx);
+		mutex_unlock(&ext4_li_mtx);
+		goto cont_thread;
+	}
+	mutex_unlock(&eli->li_list_mtx);
+	del_timer_sync(&ext4_li_info->li_timer);
+	eli->li_task = NULL;
+	wake_up(&eli->li_wait_task);
+
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	mutex_unlock(&ext4_li_mtx);
+
+	return 0;
+}
+
+static void ext4_clear_request_list(void)
+{
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	if (list_empty(&ext4_li_info->li_request_list))
+		return;
+
+	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+		elr = list_entry(pos, struct ext4_li_request,
+				 lr_request);
+		ext4_remove_li_request(elr);
+	}
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+static int ext4_run_lazyinit_thread(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+		ext4_clear_request_list();
+		del_timer_sync(&ext4_li_info->li_timer);
+		kfree(ext4_li_info);
+		ext4_li_info = NULL;
+		printk(KERN_CRIT "EXT4: error %d creating inode table "
+				 "initialization thread\n",
+				 err);
+		return err;
+	}
+	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+
+	wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+	return 0;
+}
+
+/*
+ * Check whether it make sense to run itable init. thread or not.
+ * If there is at least one uninitialized inode table, return
+ * corresponding group number, else the loop goes through all
+ * groups and return total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *gdp = NULL;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			continue;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	return group;
+}
+
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *eli = NULL;
+
+	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+	if (!eli)
+		return -ENOMEM;
+
+	eli->li_task = NULL;
+	INIT_LIST_HEAD(&eli->li_request_list);
+	mutex_init(&eli->li_list_mtx);
+
+	init_waitqueue_head(&eli->li_wait_daemon);
+	init_waitqueue_head(&eli->li_wait_task);
+	init_timer(&eli->li_timer);
+	eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = eli;
+
+	return 0;
+}
+
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+					    ext4_group_t start)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	unsigned long rnd;
+
+	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+	if (!elr)
+		return NULL;
+
+	elr->lr_super = sb;
+	elr->lr_sbi = sbi;
+	elr->lr_next_group = start;
+
+	/*
+	 * Randomize first schedule time of the request to
+	 * spread the inode table initialization requests
+	 * better.
+	 */
+	get_random_bytes(&rnd, sizeof(rnd));
+	elr->lr_next_sched = jiffies + (unsigned long)rnd %
+			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+	return elr;
+}
+
+static int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	int ret = 0;
+
+	if (sbi->s_li_request != NULL)
+		goto out;
+
+	if (first_not_zeroed == ngroups ||
+	    (sb->s_flags & MS_RDONLY) ||
+	    !test_opt(sb, INIT_INODE_TABLE)) {
+		sbi->s_li_request = NULL;
+		goto out;
+	}
+
+	if (first_not_zeroed == ngroups) {
+		sbi->s_li_request = NULL;
+		goto out;
+	}
+
+	elr = ext4_li_request_new(sb, first_not_zeroed);
+	if (!elr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&ext4_li_mtx);
+
+	if (NULL == ext4_li_info) {
+		ret = ext4_li_info_new();
+		if (ret)
+			goto out;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+
+	sbi->s_li_request = elr;
+
+	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+		ret = ext4_run_lazyinit_thread();
+		if (ret)
+			goto out;
+	}
+
+	mutex_unlock(&ext4_li_mtx);
+
+out:
+	if (ret) {
+		mutex_unlock(&ext4_li_mtx);
+		kfree(elr);
+	}
+	return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	/*
+	 * If thread exited earlier
+	 * there's nothing to be done.
+	 */
+	if (!ext4_li_info)
+		return;
+
+	ext4_clear_request_list();
+
+	while (ext4_li_info->li_task) {
+		wake_up(&ext4_li_info->li_wait_daemon);
+		wait_event(ext4_li_info->li_wait_task,
+			   ext4_li_info->li_task == NULL);
+	}
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2568,6 +2979,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	ext4_group_t first_not_zeroed;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -2630,6 +3042,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2909,7 +3322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount2;
 		}
 	}
-	if (!ext4_check_descriptors(sb)) {
+	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 		goto failed_mount2;
 	}
@@ -3130,6 +3543,10 @@ no_journal:
 		goto failed_mount4;
 	}
 
+	err = ext4_register_li_request(sb, first_not_zeroed);
+	if (err)
+		goto failed_mount4;
+
 	sbi->s_kobj.kset = ext4_kset;
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3847,6 +4264,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			enable_quota = 1;
 		}
 	}
+
+	/*
+	 * Reinitialize lazy itable initialization thread based on
+	 * current settings
+	 */
+	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+		ext4_unregister_li_request(sb);
+	else {
+		ext4_group_t first_not_zeroed;
+		first_not_zeroed = ext4_has_uninit_itable(sb);
+		ext4_register_li_request(sb, first_not_zeroed);
+	}
+
 	ext4_setup_system_zone(sb);
 	if (sbi->s_journal == NULL)
 		ext4_commit_super(sb, 1);
@@ -4317,6 +4747,9 @@ static int __init init_ext4_fs(void)
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
+
+	ext4_li_info = NULL;
+	mutex_init(&ext4_li_mtx);
 	return 0;
 out:
 	unregister_as_ext2();
@@ -4336,6 +4769,7 @@ out4:
 
 static void __exit exit_ext4_fs(void)
 {
+	ext4_destroy_lazyinit_thread();
 	unregister_as_ext2();
 	unregister_as_ext3();
 	unregister_filesystem(&ext4_fs_type);
-- 
cgit v1.2.3


From 857ac889cce8a486d47874db4d2f9620e7e9e5de Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:05 -0400
Subject: ext4: add interface to advertise ext4 features in sysfs

User-space should have the opportunity to check what features doest ext4
support in each particular copy. This adds easy interface by creating new
"features" directory in sys/fs/ext4/. In that directory files
advertising feature names can be created.

Add lazy_itable_init to the feature list.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  5 +++++
 fs/ext4/ialloc.c | 19 ++++++++++---------
 fs/ext4/super.c  | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 65 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0fe078d368d0..4c5fe37b237d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1574,6 +1574,11 @@ struct ext4_li_request {
 	unsigned long		lr_timeout;
 };
 
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
+
 /*
  * Function prototypes
  */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e428f23215c0..87d228aae6b0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1274,6 +1274,16 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 			    ext4_itable_unused_count(sb, gdp)),
 			    sbi->s_inodes_per_block);
 
+	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+		ext4_error(sb, "Something is wrong with group %u\n"
+			   "Used itable blocks: %d"
+			   "itable unused count: %u\n",
+			   group, used_blks,
+			   ext4_itable_unused_count(sb, gdp));
+		ret = 1;
+		goto out;
+	}
+
 	blk = ext4_inode_table(sb, gdp) + used_blks;
 	num = sbi->s_itb_per_group - used_blks;
 
@@ -1283,15 +1293,6 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 	if (ret)
 		goto err_out;
 
-	if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) {
-		ext4_error(sb, "Something is wrong with group %u\n"
-			   "Used itable blocks: %d"
-			   "Itable blocks per group: %lu\n",
-			   group, used_blks, sbi->s_itb_per_group);
-		ret = 1;
-		goto err_out;
-	}
-
 	/*
 	 * Skip zeroout if the inode table is full. But we set the ZEROED
 	 * flag anyway, because obviously, when it is full it does not need
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5066537e5a38..c5b890140d01 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -57,6 +57,7 @@ struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 struct ext4_lazy_init *ext4_li_info;
 struct mutex ext4_li_mtx;
+struct ext4_features *ext4_feat;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -709,6 +710,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
@@ -727,7 +729,6 @@ static void ext4_put_super(struct super_block *sb)
 	}
 
 	del_timer(&sbi->s_err_report);
-	ext4_unregister_li_request(sb);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -2416,6 +2417,7 @@ static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
@@ -2452,6 +2454,14 @@ static struct attribute *ext4_attrs[] = {
 	NULL,
 };
 
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+
+static struct attribute *ext4_feat_attrs[] = {
+	ATTR_LIST(lazy_itable_init),
+	NULL,
+};
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
@@ -2480,7 +2490,6 @@ static void ext4_sb_release(struct kobject *kobj)
 	complete(&sbi->s_kobj_unregister);
 }
 
-
 static const struct sysfs_ops ext4_attr_ops = {
 	.show	= ext4_attr_show,
 	.store	= ext4_attr_store,
@@ -2492,6 +2501,17 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+static void ext4_feat_release(struct kobject *kobj)
+{
+	complete(&ext4_feat->f_kobj_unregister);
+}
+
+static struct kobj_type ext4_feat_ktype = {
+	.default_attrs	= ext4_feat_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_feat_release,
+};
+
 /*
  * Check whether this filesystem can be mounted based on
  * the features present and the RDONLY/RDWR mount requested.
@@ -4720,6 +4740,30 @@ static struct file_system_type ext4_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+int __init ext4_init_feat_adverts(void)
+{
+	struct ext4_features *ef;
+	int ret = -ENOMEM;
+
+	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+	if (!ef)
+		goto out;
+
+	ef->f_kobj.kset = ext4_kset;
+	init_completion(&ef->f_kobj_unregister);
+	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+				   "features");
+	if (ret) {
+		kfree(ef);
+		goto out;
+	}
+
+	ext4_feat = ef;
+	ret = 0;
+out:
+	return ret;
+}
+
 static int __init init_ext4_fs(void)
 {
 	int err;
@@ -4732,6 +4776,9 @@ static int __init init_ext4_fs(void)
 	if (!ext4_kset)
 		goto out4;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+
+	err = ext4_init_feat_adverts();
+
 	err = init_ext4_mballoc();
 	if (err)
 		goto out3;
@@ -4760,6 +4807,7 @@ out1:
 out2:
 	exit_ext4_mballoc();
 out3:
+	kfree(ext4_feat);
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 out4:
-- 
cgit v1.2.3


From a31437b85aa2cbc4ea80f9ee5d603fdd5430980c Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:05 -0400
Subject: ext4: use sb_issue_zeroout in setup_new_group_blocks

Use sb_issue_zeroout to zero out inode table and descriptor table
blocks instead of old approach which involves journaling.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 46 +++++++++++++---------------------------------
 1 file changed, 13 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..2f5e347de48b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 	}
 
 	/* Zero out all of the reserved backup group descriptor table blocks */
-	for (i = 0, bit = gdblocks + 1, block = start + bit;
-	     i < reserved_gdb; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
+	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+			block, sbi->s_itb_per_group);
+	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+			       GFP_NOFS, BLKDEV_IFL_WAIT);
+	if (err)
+		goto exit_bh;
 
-		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(gdb);
-			goto exit_bh;
-		}
-		ext4_handle_dirty_metadata(handle, NULL, gdb);
-		ext4_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
 	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
 		   input->block_bitmap - start);
 	ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,23 +241,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
 
 	/* Zero out all of the inode table blocks */
-	for (i = 0, block = input->inode_table, bit = block - start;
-	     i < sbi->s_itb_per_group; i++, bit++, block++) {
-		struct buffer_head *it;
-
-		ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
-
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
-
-		if (IS_ERR(it = bclean(handle, sb, block))) {
-			err = PTR_ERR(it);
-			goto exit_bh;
-		}
-		ext4_handle_dirty_metadata(handle, NULL, it);
-		brelse(it);
-		ext4_set_bit(bit, bh->b_data);
-	}
+	block = input->inode_table;
+	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+			block, sbi->s_itb_per_group);
+	err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+			       GFP_NOFS, BLKDEV_IFL_WAIT);
+	if (err)
+		goto exit_bh;
 
 	if ((err = extend_or_restart_transaction(handle, 2, bh)))
 		goto exit_bh;
-- 
cgit v1.2.3


From 2407518de63a2f80d9b850fb525f35df93bbbe53 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:06 -0400
Subject: ext4: use sb_issue_zeroout in ext4_ext_zeroout

Change ext4_ext_zeroout to use sb_issue_zeroout instead of its
own approach to zero out extents.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 69 ++++++-------------------------------------------------
 1 file changed, 7 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 820278410220..a0e623055955 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2535,77 +2535,22 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
 	int ret;
-	struct bio *bio;
-	int blkbits, blocksize;
-	sector_t ee_pblock;
-	struct completion event;
-	unsigned int ee_len, len, done, offset;
 
-
-	blkbits   = inode->i_blkbits;
-	blocksize = inode->i_sb->s_blocksize;
 	ee_len    = ext4_ext_get_actual_len(ex);
 	ee_pblock = ext_pblock(ex);
 
-	/* convert ee_pblock to 512 byte sectors */
-	ee_pblock = ee_pblock << (blkbits - 9);
-
-	while (ee_len > 0) {
-
-		if (ee_len > BIO_MAX_PAGES)
-			len = BIO_MAX_PAGES;
-		else
-			len = ee_len;
-
-		bio = bio_alloc(GFP_NOIO, len);
-		if (!bio)
-			return -ENOMEM;
-
-		bio->bi_sector = ee_pblock;
-		bio->bi_bdev   = inode->i_sb->s_bdev;
-
-		done = 0;
-		offset = 0;
-		while (done < len) {
-			ret = bio_add_page(bio, ZERO_PAGE(0),
-							blocksize, offset);
-			if (ret != blocksize) {
-				/*
-				 * We can't add any more pages because of
-				 * hardware limitations.  Start a new bio.
-				 */
-				break;
-			}
-			done++;
-			offset += blocksize;
-			if (offset >= PAGE_CACHE_SIZE)
-				offset = 0;
-		}
-
-		init_completion(&event);
-		bio->bi_private = &event;
-		bio->bi_end_io = bi_complete;
-		submit_bio(WRITE, bio);
-		wait_for_completion(&event);
+	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len,
+			       GFP_NOFS, BLKDEV_IFL_WAIT);
+	if (ret > 0)
+		ret = 0;
 
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-			bio_put(bio);
-			return -EIO;
-		}
-		bio_put(bio);
-		ee_len    -= done;
-		ee_pblock += done  << (blkbits - 9);
-	}
-	return 0;
+	return ret;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
-- 
cgit v1.2.3


From c41303ced67c4ebf51bf2e7d0f139155e09e0939 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <zenczykowski@gmail.com>
Date: Wed, 27 Oct 2010 21:30:06 -0400
Subject: ext4: don't update sb journal_devnum when RO dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An ext4 filesystem on a read-only device, with an external journal
which is at a different device number then recorded in the superblock
will fail to honor the read-only setting of the device and trigger
a superblock update (write).

For example:
  - ext4 on a software raid which is in read-only mode
  - external journal on a read-write device which has changed device num
  - attempt to mount with -o journal_dev=<new_number>
  - hits BUG_ON(mddev->ro = 1) in md.c

Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c5b890140d01..8a24e9be7cb0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3908,7 +3908,7 @@ static int ext4_load_journal(struct super_block *sb,
 	EXT4_SB(sb)->s_journal = journal;
 	ext4_clear_journal_err(sb, es);
 
-	if (journal_devnum &&
+	if (!really_read_only && journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
 
-- 
cgit v1.2.3


From e0d10bfa91b0a089a9e2c378b5c42f4e96171d95 Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 21:30:06 -0400
Subject: ext4: improve llseek error handling for overly large seek offsets

The llseek system call should return EINVAL if passed a seek offset
which results in a write error.  What this maximum offset should be
depends on whether or not the huge_file file system feature is set,
and whether or not the file is extent based or not.


If the file has no "EXT4_EXTENTS_FL" flag, the maximum size which can be
written (write systemcall) is different from the maximum size which can be
sought (lseek systemcall).

For example, the following 2 cases demonstrates the differences
between the maximum size which can be written, versus the seek offset
allowed by the llseek system call:

#1: mkfs.ext3 <dev>; mount -t ext4 <dev>
#2: mkfs.ext3 <dev>; tune2fs -Oextent,huge_file <dev>; mount -t ext4 <dev>

Table. the max file size which we can write or seek
       at each filesystem feature tuning and file flag setting
+============+===============================+===============================+
| \ File flag|                               |                               |
|      \     |     !EXT4_EXTENTS_FL          |        EXT4_EXTETNS_FL        |
|case       \|                               |                               |
+------------+-------------------------------+-------------------------------+
| #1         |   write:      2194719883264   | write:       --------------   |
|            |   seek:       2199023251456   | seek:        --------------   |
+------------+-------------------------------+-------------------------------+
| #2         |   write:      4402345721856   | write:       17592186044415   |
|            |   seek:      17592186044415   | seek:        17592186044415   |
+------------+-------------------------------+-------------------------------+

The differences exist because ext4 has 2 maxbytes which are sb->s_maxbytes
(= extent-mapped maxbytes) and EXT4_SB(sb)->s_bitmap_maxbytes (= block-mapped
maxbytes).  Although generic_file_llseek uses only extent-mapped maxbytes.
(llseek of ext4_file_operations is generic_file_llseek which uses
sb->s_maxbytes.)

Therefore we create ext4 llseek function which uses 2 maxbytes.

The new own function originates from generic_file_llseek().
If the file flag, "EXT4_EXTENTS_FL" is not set, the function alters
inode->i_sb->s_maxbytes into EXT4_SB(inode->i_sb)->s_bitmap_maxbytes.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
---
 fs/ext4/dir.c  |  2 +-
 fs/ext4/ext4.h |  1 +
 fs/ext4/file.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..ece76fb6a40c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
 				struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext4_readdir,		/* we take BKL. needed?*/
 	.unlocked_ioctl = ext4_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4c5fe37b237d..e1c01552a3df 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2006,6 +2006,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..5a5c55ddceef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -130,8 +130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	return dquot_file_open(inode, filp);
 }
 
+/*
+ * ext4_llseek() copied from generic_file_llseek() to handle both
+ * block-mapped and extent-mapped maxbytes values. This should
+ * otherwise be identical with generic_file_llseek().
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t maxbytes;
+
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+	else
+		maxbytes = inode->i_sb->s_maxbytes;
+	mutex_lock(&inode->i_mutex);
+	switch (origin) {
+	case SEEK_END:
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		if (offset == 0) {
+			mutex_unlock(&inode->i_mutex);
+			return file->f_pos;
+		}
+		offset += file->f_pos;
+		break;
+	}
+
+	if (offset < 0 || offset > maxbytes) {
+		mutex_unlock(&inode->i_mutex);
+		return -EINVAL;
+	}
+
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	mutex_unlock(&inode->i_mutex);
+
+	return offset;
+}
+
 const struct file_operations ext4_file_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-- 
cgit v1.2.3


From 0c9169ccad4aed233fdd49e95da4eada2536a06d Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Wed, 27 Oct 2010 21:30:07 -0400
Subject: ext4: fix potential infinite loop in ext4_da_writepages()

On linux-2.6.36-rc2, if we execute the following script, we can hang
the system when the /bin/sync command is executed:

========================================================================
#!/bin/sh

echo -n "HANG UP TEST: "
/bin/dd if=/dev/zero of=/tmp/img bs=1k count=1 seek=1M 2> /dev/null
/sbin/mkfs.ext4 -Fq /tmp/img
/bin/mount -o loop -t ext4 /tmp/img /mnt
/bin/dd if=/dev/zero of=/mnt/file bs=1 count=1 \
seek=$((16*1024*1024*1024*1024-4096)) 2> /dev/null
/bin/sync
/bin/umount /mnt
echo "DONE"
exit 0
========================================================================

We can see the following backtrace if we get the kdump when this
hangup occurs:

======================================================================
kthread()
=> bdi_writeback_thread()
   => wb_do_writeback()
      => wb_writeback()
         => writeback_inodes_wb()
            => writeback_sb_inodes()
               => writeback_single_inode()
                  => ext4_da_writepages()  ---+
                                ^ infinite    |
                                |   loop      |
                                +-------------+
======================================================================

The reason why this hangup happens is described as follows:
1) We write the last extent block of the file whose size is the filesystem
   maximum size.
2) "BH_Delay" flag is set on the buffer_head of its block.
3) - the member, "m_lblk" of struct mpage_da_data is 4294967295 (UINT_MAX)
   - the member, "m_len" of struct mpage_da_data is 1
  mpage_put_bnr_to_bhs() which is called via ext4_da_writepages()
  cannot clear "BH_Delay" flag of the buffer_head because the type of
  m_lblk is ext4_lblk_t and then m_lblk + m_len is overflow.

  Therefore an infinite loop occurs because ext4_da_writepages()
  cannot write the page (which corresponds to the block) since
  "BH_Delay" flag isn't cleared.
----------------------------------------------------------------------
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
				struct ext4_map_blocks *map)
{
...
	int blocks = map->m_len;
...
		do {
			// cur_logical = 4294967295
			// map->m_lblk = 4294967295
			// blocks = 1
			// *** map->m_lblk + blocks == 0 (OVERFLOW!) ***
			// (cur_logical >= map->m_lblk + blocks) => true
			if (cur_logical >= map->m_lblk + blocks)
				break;
----------------------------------------------------------------------

NOTE: Mounting with the nodelalloc option will avoid this codepath,
and thus, avoid this hang

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50f3bba68a25..1e824a3ec538 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2105,7 +2105,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
 			} while ((bh = bh->b_this_page) != head);
 
 			do {
-				if (cur_logical >= map->m_lblk + blocks)
+				if (cur_logical > map->m_lblk + (blocks - 1))
 					break;
 
 				if (buffer_delay(bh) || buffer_unwritten(bh)) {
-- 
cgit v1.2.3


From 3e1e5f501632460184a98237d5460c521510535e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:07 -0400
Subject: ext4: don't use ext4_allocation_contexts for tracing

Many tracepoints were populating an ext4_allocation_context
to pass in, but this requires a slab allocation even when
tracepoints are off.  In fact, 4 of 5 of these allocations
were only for tracing.  In addition, we were only using a
small fraction of the 144 bytes of this structure for this
purpose.

We can do away with all these alloc/frees of the ac and
simply pass in the bits we care about, instead.

I tested this by turning on tracing and running through
xfstests on x86_64.  I did not actually do anything with
the trace output, however.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c           | 81 +++++++--------------------------------------
 include/trace/events/ext4.h | 51 ++++++++++++++++------------
 2 files changed, 41 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ccdfec6acb75..3f62df50f483 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3591,8 +3591,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-			struct ext4_prealloc_space *pa,
-			struct ext4_allocation_context *ac)
+			struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3610,11 +3609,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = pa->pa_inode;
-	}
-
 	while (bit < end) {
 		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
 		if (bit >= end)
@@ -3625,16 +3619,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			 (unsigned) next - bit, (unsigned) group);
 		free += next - bit;
 
-		if (ac) {
-			ac->ac_b_ex.fe_group = group;
-			ac->ac_b_ex.fe_start = bit;
-			ac->ac_b_ex.fe_len = next - bit;
-			ac->ac_b_ex.fe_logical = 0;
-			trace_ext4_mballoc_discard(ac);
-		}
-
-		trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-					       next - bit);
+		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+		trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+					       grp_blk_start + bit, next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3657,29 +3644,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-				struct ext4_prealloc_space *pa,
-				struct ext4_allocation_context *ac)
+				struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	trace_ext4_mb_release_group_pa(sb, ac, pa);
+	trace_ext4_mb_release_group_pa(sb, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = NULL;
-		ac->ac_b_ex.fe_group = group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = pa->pa_len;
-		ac->ac_b_ex.fe_logical = 0;
-		trace_ext4_mballoc_discard(ac);
-	}
+	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
 
 	return 0;
 }
@@ -3700,7 +3677,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	struct list_head list;
 	struct ext4_buddy e4b;
 	int err;
@@ -3729,9 +3705,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
 	INIT_LIST_HEAD(&list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 repeat:
 	ext4_lock_group(sb, group);
 	list_for_each_entry_safe(pa, tmp,
@@ -3786,9 +3759,9 @@ repeat:
 		spin_unlock(pa->pa_obj_lock);
 
 		if (pa->pa_type == MB_GROUP_PA)
-			ext4_mb_release_group_pa(&e4b, pa, ac);
+			ext4_mb_release_group_pa(&e4b, pa);
 		else
-			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3796,8 +3769,6 @@ repeat:
 
 out:
 	ext4_unlock_group(sb, group);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
@@ -3818,7 +3789,6 @@ void ext4_discard_preallocations(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	ext4_group_t group = 0;
 	struct list_head list;
 	struct ext4_buddy e4b;
@@ -3834,11 +3804,6 @@ void ext4_discard_preallocations(struct inode *inode)
 
 	INIT_LIST_HEAD(&list);
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = inode;
-	}
 repeat:
 	/* first, collect all pa's in the inode */
 	spin_lock(&ei->i_prealloc_lock);
@@ -3908,7 +3873,7 @@ repeat:
 
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
@@ -3917,8 +3882,6 @@ repeat:
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4116,14 +4079,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_buddy e4b;
 	struct list_head discard_list;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 
 	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 
 	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4175,15 +4134,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		}
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_mb_release_group_pa(&e4b, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4547,7 +4504,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	unsigned long freed = 0;
 	unsigned int overflow;
@@ -4602,12 +4558,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	if (!ext4_should_writeback_data(inode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_inode = inode;
-		ac->ac_sb = sb;
-	}
-
 do_more:
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4665,12 +4615,7 @@ do_more:
 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	if (ac) {
-		ac->ac_b_ex.fe_group = block_group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = count;
-		trace_ext4_mballoc_free(ac);
-	}
+	trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
 
 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
 	if (err)
@@ -4741,7 +4686,5 @@ error_return:
 		dquot_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index fbdfa3a6bbd8..b5f4938d612d 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -396,11 +396,11 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
 
 TRACE_EVENT(ext4_mb_release_inode_pa,
 	TP_PROTO(struct super_block *sb,
-		 struct ext4_allocation_context *ac,
+		 struct inode *inode,
 		 struct ext4_prealloc_space *pa,
 		 unsigned long long block, unsigned int count),
 
-	TP_ARGS(sb, ac, pa, block, count),
+	TP_ARGS(sb, inode, pa, block, count),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
@@ -412,8 +412,7 @@ TRACE_EVENT(ext4_mb_release_inode_pa,
 
 	TP_fast_assign(
 		__entry->dev		= sb->s_dev;
-		__entry->ino		= (ac && ac->ac_inode) ? 
-						ac->ac_inode->i_ino : 0;
+		__entry->ino		= inode->i_ino;
 		__entry->block		= block;
 		__entry->count		= count;
 	),
@@ -425,10 +424,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa,
 
 TRACE_EVENT(ext4_mb_release_group_pa,
 	TP_PROTO(struct super_block *sb,
-		 struct ext4_allocation_context *ac,
 		 struct ext4_prealloc_space *pa),
 
-	TP_ARGS(sb, ac, pa),
+	TP_ARGS(sb, pa),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
@@ -779,47 +777,56 @@ TRACE_EVENT(ext4_mballoc_prealloc,
 );
 
 DECLARE_EVENT_CLASS(ext4__mballoc,
-	TP_PROTO(struct ext4_allocation_context *ac),
+	TP_PROTO(struct super_block *sb,
+		 struct inode *inode,
+		 ext4_group_t group,
+		 ext4_grpblk_t start,
+		 ext4_grpblk_t len),
 
-	TP_ARGS(ac),
+	TP_ARGS(sb, inode, group, start, len),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
-		__field(	__u32, 	result_logical		)
 		__field(	  int,	result_start		)
 		__field(	__u32, 	result_group		)
 		__field(	  int,	result_len		)
 	),
 
 	TP_fast_assign(
-		__entry->dev		= ac->ac_sb->s_dev;
-		__entry->ino		= ac->ac_inode ?
-						ac->ac_inode->i_ino : 0;
-		__entry->result_logical	= ac->ac_b_ex.fe_logical;
-		__entry->result_start	= ac->ac_b_ex.fe_start;
-		__entry->result_group	= ac->ac_b_ex.fe_group;
-		__entry->result_len	= ac->ac_b_ex.fe_len;
+		__entry->dev		= sb->s_dev;
+		__entry->ino		= inode ? inode->i_ino : 0;
+		__entry->result_start	= start;
+		__entry->result_group	= group;
+		__entry->result_len	= len;
 	),
 
-	TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
+	TP_printk("dev %s inode %lu extent %u/%d/%u ",
 		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
 		  __entry->result_group, __entry->result_start,
-		  __entry->result_len, __entry->result_logical)
+		  __entry->result_len)
 );
 
 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
 
-	TP_PROTO(struct ext4_allocation_context *ac),
+	TP_PROTO(struct super_block *sb,
+		 struct inode *inode,
+		 ext4_group_t group,
+		 ext4_grpblk_t start,
+		 ext4_grpblk_t len),
 
-	TP_ARGS(ac)
+	TP_ARGS(sb, inode, group, start, len)
 );
 
 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
 
-	TP_PROTO(struct ext4_allocation_context *ac),
+	TP_PROTO(struct super_block *sb,
+		 struct inode *inode,
+		 ext4_group_t group,
+		 ext4_grpblk_t start,
+		 ext4_grpblk_t len),
 
-	TP_ARGS(ac)
+	TP_ARGS(sb, inode, group, start, len)
 );
 
 TRACE_EVENT(ext4_forget,
-- 
cgit v1.2.3


From c999af2b347a55174f702702e0df814d05ef5491 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:07 -0400
Subject: ext4: queue conversion after adding to inode's completed IO list

By queuing the io end on the unwritten workqueue before adding it
to our inode's list of completed IOs, I think we run the risk
of the work getting completed, and the IO freed, before we try
to add it to the inode's i_completed_io_list.

It should be safe to add it to the inode's list of completed
IOs, and -then- queue it for completion, I think.

Thanks to Dave Chinner for pointing out the race.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e824a3ec538..670ab15e4f9a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3850,14 +3850,14 @@ out:
 	}
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-	/* queue the work to convert unwritten extents to written */
-	queue_work(wq, &io_end->work);
-
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
 	iocb->private = NULL;
 }
 
-- 
cgit v1.2.3


From 640e9396566a1e1f52f2db294755a23f1e62cc97 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:08 -0400
Subject: ext4: remove unused ext4_sb_info members

Not that these take up a lot of room, but the structure is long enough
as it is, and there's no need to confuse people with these various
undocumented & unused structure members...

Signed-off-by: Eric Sandeen <sandeen@redaht.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e1c01552a3df..22833691e98c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1089,7 +1089,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1122,10 +1121,7 @@ struct ext4_sb_info {
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1143,7 +1139,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
-- 
cgit v1.2.3


From 8941ec8bb6443d28d5c25311870aeaa809cf1538 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:08 -0400
Subject: ext4: avoid uninitialized memory references in
 ext3_htree_next_block()

If the first block of htree directory is missing '.' or '..' but is
otherwise a valid directory, and we do a lookup for '.' or '..', it's
possible to dereference an uninitialized memory pointer in
ext4_htree_next_block().

We avoid this by moving the special case from ext4_dx_find_entry() to
ext4_find_entry(); this also means we can optimize ext4_find_entry()
slightly when NFS looks up "..".

Thanks to Brad Spengler for pointing a Clang warning that led me to
look more closely at this code.  The warning was harmless, but it was
useful in pointing out code that was too ugly to live.  This warning was
also reported by Roman Borisov.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Brad Spengler <spender@grsecurity.net>
---
 fs/ext4/namei.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..213523803dff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
 	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block, b;
+	const u8 *name = d_name->name;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
 	int ra_ptr = 0;		/* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	namelen = d_name->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == '0')) {
+		/*
+		 * "." or ".." will only be in the first block
+		 * NFS may look up ".."; "." should be handled by the VFS
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
 	if (is_dx(dir)) {
 		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
 		/*
@@ -960,9 +971,8 @@ cleanup_and_exit:
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 		       struct ext4_dir_entry_2 **res_dir, int *err)
 {
-	struct super_block * sb;
+	struct super_block * sb = dir->i_sb;
 	struct dx_hash_info	hinfo;
-	u32 hash;
 	struct dx_frame frames[2], *frame;
 	struct ext4_dir_entry_2 *de, *top;
 	struct buffer_head *bh;
@@ -971,18 +981,8 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 	int namelen = d_name->len;
 	const u8 *name = d_name->name;
 
-	sb = dir->i_sb;
-	/* NFS may look up ".." - look at dx_root directory block */
-	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-			return NULL;
-	} else {
-		frame = frames;
-		frame->bh = NULL;			/* for dx_release() */
-		frame->at = (struct dx_entry *)frames;	/* hack for zero entry*/
-		dx_set_block(frame->at, 0);		/* dx_root block is 0 */
-	}
-	hash = hinfo.hash;
+	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+		return NULL;
 	do {
 		block = dx_get_block(frame->at);
 		if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
@@ -1008,7 +1008,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		}
 		brelse(bh);
 		/* Check to see if we should continue to search */
-		retval = ext4_htree_next_block(dir, hash, frame,
+		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
 					       frames, NULL);
 		if (retval < 0) {
 			ext4_warning(sb,
-- 
cgit v1.2.3


From 7845c0497536c566bfef08db1a38ae1ad2c25464 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:08 -0400
Subject: ext4: use search_dirblock() in ext4_dx_find_entry()

Use the search_dirblock() in ext4_dx_find_entry().  It makes the code
easier to read, and it takes advantage of common code.  It also saves
100 bytes or so of text space.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Brad Spengler <spender@grsecurity.net>
---
 fs/ext4/namei.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 213523803dff..86a7870babbd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -974,39 +974,30 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 	struct super_block * sb = dir->i_sb;
 	struct dx_hash_info	hinfo;
 	struct dx_frame frames[2], *frame;
-	struct ext4_dir_entry_2 *de, *top;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
-	int namelen = d_name->len;
-	const u8 *name = d_name->name;
 
 	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
 		return NULL;
 	do {
 		block = dx_get_block(frame->at);
-		if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+		if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
 			goto errout;
-		de = (struct ext4_dir_entry_2 *) bh->b_data;
-		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-				       EXT4_DIR_REC_LEN(0));
-		for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-			int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-				  + ((char *) de - bh->b_data);
-
-			if (!ext4_check_dir_entry(dir, de, bh, off)) {
-				brelse(bh);
-				*err = ERR_BAD_DX_DIR;
-				goto errout;
-			}
 
-			if (ext4_match(namelen, name, de)) {
-				*res_dir = de;
-				dx_release(frames);
-				return bh;
-			}
+		retval = search_dirblock(bh, dir, d_name,
+					 block << EXT4_BLOCK_SIZE_BITS(sb),
+					 res_dir);
+		if (retval == 1) { 	/* Success! */
+			dx_release(frames);
+			return bh;
 		}
 		brelse(bh);
+		if (retval == -1) {
+			*err = ERR_BAD_DX_DIR;
+			goto errout;
+		}
+
 		/* Check to see if we should continue to search */
 		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
 					       frames, NULL);
-- 
cgit v1.2.3


From 16828088f9e518158edecb6cde7e6fa38e4c889b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:09 -0400
Subject: ext4: use KMEM_CACHE instead of kmem_cache_create

Also remove the SLAB_RECLAIM_ACCOUNT flag from the system zone kmem
cache.  This slab tends to be fairly static, so it shouldn't be marked
as likely to have free pages that can be reclaimed.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/block_validity.c |  3 +--
 fs/ext4/mballoc.c        | 18 ++++++------------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..68bab70dd139 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -31,8 +31,7 @@ static struct kmem_cache *ext4_system_zone_cachep;
 
 int __init init_ext4_system_zone(void)
 {
-	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
-					     SLAB_RECLAIM_ACCOUNT);
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
 	if (ext4_system_zone_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3f62df50f483..d732ef5a835d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2706,26 +2706,20 @@ static void ext4_remove_debugfs_entry(void)
 
 int __init init_ext4_mballoc(void)
 {
-	ext4_pspace_cachep =
-		kmem_cache_create("ext4_prealloc_space",
-				     sizeof(struct ext4_prealloc_space),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
 	if (ext4_pspace_cachep == NULL)
 		return -ENOMEM;
 
-	ext4_ac_cachep =
-		kmem_cache_create("ext4_alloc_context",
-				     sizeof(struct ext4_allocation_context),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
 	if (ext4_ac_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep =
-		kmem_cache_create("ext4_free_block_extents",
-				     sizeof(struct ext4_free_data),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+					  SLAB_RECLAIM_ACCOUNT);
 	if (ext4_free_ext_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
-- 
cgit v1.2.3


From 5a87b7a5da250c9be6d757758425dfeaf8ed3179 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:09 -0400
Subject: ext4: call mpage_da_submit_io() from mpage_da_map_blocks()

Eventually we need to completely reorganize the ext4 writepage
callpath, but for now, we simplify things a little by calling
mpage_da_submit_io() from mpage_da_map_blocks(), since all of the
places where we call mpage_da_map_blocks() it is followed up by a call
to mpage_da_submit_io().

We're also a wee bit better with respect to error handling, but there
are still a number of issues where it's not clear what the right thing
is to do with ext4 functions deep in the writeback codepath fails.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 66 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 670ab15e4f9a..55961ff4efc2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,6 +60,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int ext4_writepage(struct page *page, struct writeback_control *wbc);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -2033,7 +2034,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 			BUG_ON(PageWriteback(page));
 
 			pages_skipped = mpd->wbc->pages_skipped;
-			err = mapping->a_ops->writepage(page, mpd->wbc);
+			err = ext4_writepage(page, mpd->wbc);
 			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
 				/*
 				 * have successfully written the page
@@ -2189,14 +2190,15 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
 	struct ext4_map_blocks map;
@@ -2206,18 +2208,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	handle_t *handle = NULL;
 
 	/*
-	 * We consider only non-mapped and non-allocated blocks
+	 * If the blocks are mapped already, or we couldn't accumulate
+	 * any blocks, then proceed immediately to the submission stage.
 	 */
-	if ((mpd->b_state  & (1 << BH_Mapped)) &&
-		!(mpd->b_state & (1 << BH_Delay)) &&
-		!(mpd->b_state & (1 << BH_Unwritten)))
-		return 0;
-
-	/*
-	 * If we didn't accumulate anything to write simply return
-	 */
-	if (!mpd->b_size)
-		return 0;
+	if ((mpd->b_size == 0) ||
+	    ((mpd->b_state  & (1 << BH_Mapped)) &&
+	     !(mpd->b_state & (1 << BH_Delay)) &&
+	     !(mpd->b_state & (1 << BH_Unwritten))))
+		goto submit_io;
 
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
@@ -2254,17 +2252,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 
 		err = blks;
 		/*
-		 * If get block returns with error we simply
-		 * return. Later writepage will redirty the page and
-		 * writepages will find the dirty page again
+		 * If get block returns EAGAIN or ENOSPC and there
+		 * appears to be free blocks we will call
+		 * ext4_writepage() for all of the pages which will
+		 * just redirty the pages.
 		 */
 		if (err == -EAGAIN)
-			return 0;
+			goto submit_io;
 
 		if (err == -ENOSPC &&
 		    ext4_count_free_blocks(sb)) {
 			mpd->retval = err;
-			return 0;
+			goto submit_io;
 		}
 
 		/*
@@ -2289,7 +2288,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		/* invalidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
 				mpd->b_size >> mpd->inode->i_blkbits);
-		return err;
+		return;
 	}
 	BUG_ON(blks == 0);
 
@@ -2312,7 +2311,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
 		if (err)
-			return err;
+			/* This only happens if the journal is aborted */
+			return;
 	}
 
 	/*
@@ -2323,10 +2323,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		disksize = i_size_read(mpd->inode);
 	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
 		ext4_update_i_disksize(mpd->inode, disksize);
-		return ext4_mark_inode_dirty(handle, mpd->inode);
+		err = ext4_mark_inode_dirty(handle, mpd->inode);
+		if (err)
+			ext4_error(mpd->inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   mpd->inode->i_ino);
 	}
 
-	return 0;
+submit_io:
+	mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2403,9 +2409,7 @@ flush_it:
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	if (mpage_da_map_blocks(mpd) == 0)
-		mpage_da_submit_io(mpd);
-	mpd->io_done = 1;
+	mpage_da_map_and_submit(mpd);
 	return;
 }
 
@@ -2437,15 +2441,13 @@ static int __mpage_da_writepage(struct page *page,
 	if (mpd->next_page != page->index) {
 		/*
 		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them using writepage()
+		 * and start IO on them
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			if (mpage_da_map_blocks(mpd) == 0)
-				mpage_da_submit_io(mpd);
+			mpage_da_map_and_submit(mpd);
 			/*
 			 * skip rest of the page in the page_vec
 			 */
-			mpd->io_done = 1;
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return MPAGE_DA_EXTENT_TAIL;
@@ -3071,9 +3073,7 @@ retry:
 		 * them for I/O.
 		 */
 		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-			if (mpage_da_map_blocks(&mpd) == 0)
-				mpage_da_submit_io(&mpd);
-			mpd.io_done = 1;
+			mpage_da_map_and_submit(&mpd);
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
 		trace_ext4_da_write_pages(inode, &mpd);
-- 
cgit v1.2.3


From a42afc5f56f319107e987aa6adf2f65d93d527c7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:09 -0400
Subject: ext4: simplify ext4_writepage()

The actual code in ext4_writepage() is unnecessarily convoluted.
Simplify it so it is easier to understand, but otherwise logically
equivalent.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 73 ++++++++++++++++++++-------------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55961ff4efc2..a08ec795995f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2704,7 +2704,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0;
+	int ret = 0, commit_write = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
@@ -2717,60 +2717,37 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
-	if (page_has_buffers(page)) {
-		page_bufs = page_buffers(page);
-		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-					ext4_bh_delay_or_unwritten)) {
-			/*
-			 * We don't want to do  block allocation
-			 * So redirty the page and return
-			 * We may reach here when we do a journal commit
-			 * via journal_submit_inode_data_buffers.
-			 * If we don't have mapping block we just ignore
-			 * them. We can also reach here via shrink_page_list
-			 */
+	/*
+	 * If the page does not have buffers (for whatever reason),
+	 * try to create them using block_prepare_write.  If this
+	 * fails, redirty the page and move on.
+	 */
+	if (!page_buffers(page)) {
+		if (block_prepare_write(page, 0, len,
+					noalloc_get_block_write)) {
+		redirty_page:
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return 0;
 		}
-	} else {
+		commit_write = 1;
+	}
+	page_bufs = page_buffers(page);
+	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+			      ext4_bh_delay_or_unwritten)) {
 		/*
-		 * The test for page_has_buffers() is subtle:
-		 * We know the page is dirty but it lost buffers. That means
-		 * that at some moment in time after write_begin()/write_end()
-		 * has been called all buffers have been clean and thus they
-		 * must have been written at least once. So they are all
-		 * mapped and we can happily proceed with mapping them
-		 * and writing the page.
-		 *
-		 * Try to initialize the buffer_heads and check whether
-		 * all are mapped and non delay. We don't want to
-		 * do block allocation here.
+		 * We don't want to do block allocation So redirty the
+		 * page and return We may reach here when we do a
+		 * journal commit via
+		 * journal_submit_inode_data_buffers.  If we don't
+		 * have mapping block we just ignore them. We can also
+		 * reach here via shrink_page_list
 		 */
-		ret = block_prepare_write(page, 0, len,
-					  noalloc_get_block_write);
-		if (!ret) {
-			page_bufs = page_buffers(page);
-			/* check whether all are mapped and non delay */
-			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-						ext4_bh_delay_or_unwritten)) {
-				redirty_page_for_writepage(wbc, page);
-				unlock_page(page);
-				return 0;
-			}
-		} else {
-			/*
-			 * We can't do block allocation here
-			 * so just redity the page and unlock
-			 * and return
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return 0;
-		}
+		goto redirty_page;
+	}
+	if (commit_write)
 		/* now mark the buffer_heads as dirty and uptodate */
 		block_commit_write(page, 0, len);
-	}
 
 	if (PageChecked(page) && ext4_should_journal_data(inode)) {
 		/*
@@ -2781,7 +2758,7 @@ static int ext4_writepage(struct page *page,
 		return __ext4_journalled_writepage(page, len);
 	}
 
-	if (page_bufs && buffer_uninit(page_bufs)) {
+	if (buffer_uninit(page_bufs)) {
 		ext4_set_bh_endio(page_bufs, inode);
 		ret = block_write_full_page_endio(page, noalloc_get_block_write,
 					    wbc, ext4_end_io_buffer_write);
-- 
cgit v1.2.3


From cb20d5188366f04d96d2e07b1240cc92170ade40 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:09 -0400
Subject: ext4: inline ext4_writepage() into mpage_da_submit_io()

As a prepratory step to switching to bio_submit, inline
ext4_writepage() into mpage_da_submit() and then simplify things a
bit.  This makes it clearer what mpage_da_submit needs to do.

Also, move the ClearPageChecked(page) call into
__ext4_journalled_writepage(), as a minor bit of cleanup refactoring.

This also allows us to pull i_size_read() and
ext4_should_journal_data() out of the loop, which should be a very
minor CPU savings.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a08ec795995f..97a0c35219ae 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,7 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int ext4_writepage(struct page *page, struct writeback_control *wbc);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -2000,12 +2005,15 @@ static void ext4_da_page_release_reservation(struct page *page,
  */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
-	long pages_skipped;
 	struct pagevec pvec;
 	unsigned long index, end;
 	int ret = 0, err, nr_pages, i;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
+	loff_t size = i_size_read(inode);
+	unsigned int len;
+	struct buffer_head *page_bufs = NULL;
+	int journal_data = ext4_should_journal_data(inode);
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	/*
@@ -2023,28 +2031,69 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
+			int commit_write = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
 			if (index > end)
 				break;
+
+			if (index == size >> PAGE_CACHE_SHIFT)
+				len = size & ~PAGE_CACHE_MASK;
+			else
+				len = PAGE_CACHE_SIZE;
 			index++;
 
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 
-			pages_skipped = mpd->wbc->pages_skipped;
-			err = ext4_writepage(page, mpd->wbc);
-			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+			/*
+			 * If the page does not have buffers (for
+			 * whatever reason), try to create them using
+			 * block_prepare_write.  If this fails,
+			 * redirty the page and move on.
+			 */
+			if (!page_has_buffers(page)) {
+				if (block_prepare_write(page, 0, len,
+						noalloc_get_block_write)) {
+				redirty_page:
+					redirty_page_for_writepage(mpd->wbc,
+								   page);
+					unlock_page(page);
+					continue;
+				}
+				commit_write = 1;
+			}
+			page_bufs = page_buffers(page);
+			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+					      ext4_bh_delay_or_unwritten)) {
 				/*
-				 * have successfully written the page
-				 * without skipping the same
+				 * We couldn't do block allocation for
+				 * some reason.
 				 */
+				goto redirty_page;
+			}
+
+			if (commit_write)
+				/* mark the buffer_heads as dirty & uptodate */
+				block_commit_write(page, 0, len);
+
+			if (journal_data && PageChecked(page))
+				err = __ext4_journalled_writepage(page, len);
+			else if (buffer_uninit(page_bufs)) {
+				ext4_set_bh_endio(page_bufs, inode);
+				err = block_write_full_page_endio(page,
+					noalloc_get_block_write,
+					mpd->wbc, ext4_end_io_buffer_write);
+			} else
+				err = block_write_full_page(page,
+					    noalloc_get_block_write, mpd->wbc);
+
+			if (!err)
 				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
-			 * XXX: unlock and re-dirty them?
 			 */
 			if (ret == 0)
 				ret = err;
@@ -2627,6 +2676,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	int ret = 0;
 	int err;
 
+	ClearPageChecked(page);
 	page_bufs = page_buffers(page);
 	BUG_ON(!page_bufs);
 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2749,14 +2799,12 @@ static int ext4_writepage(struct page *page,
 		/* now mark the buffer_heads as dirty and uptodate */
 		block_commit_write(page, 0, len);
 
-	if (PageChecked(page) && ext4_should_journal_data(inode)) {
+	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
-		ClearPageChecked(page);
 		return __ext4_journalled_writepage(page, len);
-	}
 
 	if (buffer_uninit(page_bufs)) {
 		ext4_set_bh_endio(page_bufs, inode);
-- 
cgit v1.2.3


From 3ecdb3a193a5f224f084c04a63aa28cdccf4d7d0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:10 -0400
Subject: ext4: inline walk_page_buffers() into mpage_da_submit_io

Expand the call:

  if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                        ext4_bh_delay_or_unwritten))
	goto redirty_page

into mpage_da_submit_io().

This will allow us to merge in mpage_put_bnr_to_bhs() in the next
patch.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 97a0c35219ae..5da6cfcecd83 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2011,8 +2011,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 	loff_t size = i_size_read(inode);
-	unsigned int len;
-	struct buffer_head *page_bufs = NULL;
+	unsigned int len, block_start;
+	struct buffer_head *bh, *page_bufs = NULL;
 	int journal_data = ext4_should_journal_data(inode);
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
@@ -2064,15 +2064,17 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 				}
 				commit_write = 1;
 			}
-			page_bufs = page_buffers(page);
-			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-					      ext4_bh_delay_or_unwritten)) {
-				/*
-				 * We couldn't do block allocation for
-				 * some reason.
-				 */
-				goto redirty_page;
-			}
+
+			bh = page_bufs = page_buffers(page);
+			block_start = 0;
+			do {
+				/* redirty page if block allocation undone */
+				if (!bh || buffer_delay(bh) ||
+				    buffer_unwritten(bh))
+					goto redirty_page;
+				bh = bh->b_this_page;
+				block_start += bh->b_size;
+			} while ((bh != page_bufs) && (block_start < len));
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
-- 
cgit v1.2.3


From 1de3e3df917459422cb2aecac440febc8879d410 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:10 -0400
Subject: ext4: move mpage_put_bnr_to_bhs()'s functionality to
 mpage_da_submit_io()

This massively simplifies the ext4_da_writepages() code path by
completely removing mpage_put_bnr_bhs(), which is almost 100 lines of
code iterating over a set of pages using pagevec_lookup(), and folds
that functionality into mpage_da_submit_io()'s existing
pagevec_lookup() loop.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 139 ++++++++++++++++----------------------------------------
 1 file changed, 38 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5da6cfcecd83..c65d647378f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2003,7 +2003,8 @@ static void ext4_da_page_release_reservation(struct page *page,
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+			      struct ext4_map_blocks *map)
 {
 	struct pagevec pvec;
 	unsigned long index, end;
@@ -2014,6 +2015,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	unsigned int len, block_start;
 	struct buffer_head *bh, *page_bufs = NULL;
 	int journal_data = ext4_should_journal_data(inode);
+	sector_t pblock = 0, cur_logical = 0;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	/*
@@ -2031,7 +2033,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0;
+			int commit_write = 0, redirty_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -2042,6 +2044,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 				len = size & ~PAGE_CACHE_MASK;
 			else
 				len = PAGE_CACHE_SIZE;
+			if (map) {
+				cur_logical = index << (PAGE_CACHE_SHIFT -
+							inode->i_blkbits);
+				pblock = map->m_pblk + (cur_logical -
+							map->m_lblk);
+			}
 			index++;
 
 			BUG_ON(!PageLocked(page));
@@ -2068,13 +2076,34 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 			bh = page_bufs = page_buffers(page);
 			block_start = 0;
 			do {
-				/* redirty page if block allocation undone */
-				if (!bh || buffer_delay(bh) ||
-				    buffer_unwritten(bh))
+				if (!bh)
 					goto redirty_page;
+				if (map && (cur_logical >= map->m_lblk) &&
+				    (cur_logical <= (map->m_lblk +
+						     (map->m_len - 1)))) {
+					if (buffer_delay(bh)) {
+						clear_buffer_delay(bh);
+						bh->b_blocknr = pblock;
+					}
+					if (buffer_unwritten(bh) ||
+					    buffer_mapped(bh))
+						BUG_ON(bh->b_blocknr != pblock);
+					if (map->m_flags & EXT4_MAP_UNINIT)
+						set_buffer_uninit(bh);
+					clear_buffer_unwritten(bh);
+				}
+
+				/* redirty page if block allocation undone */
+				if (buffer_delay(bh) || buffer_unwritten(bh))
+					redirty_page = 1;
 				bh = bh->b_this_page;
 				block_start += bh->b_size;
-			} while ((bh != page_bufs) && (block_start < len));
+				cur_logical++;
+				pblock++;
+			} while (bh != page_bufs);
+
+			if (redirty_page)
+				goto redirty_page;
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
@@ -2105,91 +2134,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	return ret;
 }
 
-/*
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-				 struct ext4_map_blocks *map)
-{
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	int blocks = map->m_len;
-	sector_t pblock = map->m_pblk, cur_logical;
-	struct buffer_head *head, *bh;
-	pgoff_t index, end;
-	struct pagevec pvec;
-	int nr_pages, i;
-
-	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	pagevec_init(&pvec, 0);
-
-	while (index <= end) {
-		/* XXX: optimize tail */
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-			BUG_ON(!page_has_buffers(page));
-
-			bh = page_buffers(page);
-			head = bh;
-
-			/* skip blocks out of the range */
-			do {
-				if (cur_logical >= map->m_lblk)
-					break;
-				cur_logical++;
-			} while ((bh = bh->b_this_page) != head);
-
-			do {
-				if (cur_logical > map->m_lblk + (blocks - 1))
-					break;
-
-				if (buffer_delay(bh) || buffer_unwritten(bh)) {
-
-					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					} else {
-						/*
-						 * unwritten already should have
-						 * blocknr assigned. Verify that
-						 */
-						clear_buffer_unwritten(bh);
-						BUG_ON(bh->b_blocknr != pblock);
-					}
-
-				} else if (buffer_mapped(bh))
-					BUG_ON(bh->b_blocknr != pblock);
-
-				if (map->m_flags & EXT4_MAP_UNINIT)
-					set_buffer_uninit(bh);
-				cur_logical++;
-				pblock++;
-			} while ((bh = bh->b_this_page) != head);
-		}
-		pagevec_release(&pvec);
-	}
-}
-
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 					sector_t logical, long blk_cnt)
 {
@@ -2252,7 +2196,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map;
+	struct ext4_map_blocks map, *mapp = NULL;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2343,6 +2287,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 	}
 	BUG_ON(blks == 0);
 
+	mapp = &map;
 	if (map.m_flags & EXT4_MAP_NEW) {
 		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
 		int i;
@@ -2351,14 +2296,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			unmap_underlying_metadata(bdev, map.m_pblk + i);
 	}
 
-	/*
-	 * If blocks are delayed marked, we need to
-	 * put actual blocknr and drop delayed bit
-	 */
-	if ((mpd->b_state & (1 << BH_Delay)) ||
-	    (mpd->b_state & (1 << BH_Unwritten)))
-		mpage_put_bnr_to_bhs(mpd, &map);
-
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
 		if (err)
@@ -2382,7 +2319,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 	}
 
 submit_io:
-	mpage_da_submit_io(mpd);
+	mpage_da_submit_io(mpd, mapp);
 	mpd->io_done = 1;
 }
 
-- 
cgit v1.2.3


From bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:10 -0400
Subject: ext4: use bio layer instead of buffer layer in mpage_da_submit_io

Call the block I/O layer directly instad of going through the buffer
layer.  This should give us much better performance and scalability,
as well as lowering our CPU utilization when doing buffered writeback.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Makefile  |   2 +-
 fs/ext4/ext4.h    |  36 ++++-
 fs/ext4/extents.c |   4 +-
 fs/ext4/inode.c   | 118 ++-------------
 fs/ext4/page-io.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |   8 +-
 6 files changed, 489 insertions(+), 109 deletions(-)
 create mode 100644 fs/ext4/page-io.c

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 22833691e98c..ca9fda64dd4f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -168,7 +168,20 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	EXT4_IO_UNWRITTEN	0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	int		p_count;
+};
+
+#define MAX_IO_PAGES 128
+
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
  */
@@ -2044,6 +2067,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
+/* page-io.c */
+extern int __init init_ext4_pageio(void);
+extern void exit_ext4_pageio(void);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+			       struct page *page,
+			       int len,
+			       struct writeback_control *wbc);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a0e623055955..a1e20c8c4e0c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3202,7 +3202,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * completed
 		 */
 		if (io)
-			io->flag = EXT4_IO_UNWRITTEN;
+			io->flag = EXT4_IO_END_UNWRITTEN;
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
@@ -3494,7 +3494,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io)
-				io->flag = EXT4_IO_UNWRITTEN;
+				io->flag = EXT4_IO_END_UNWRITTEN;
 			else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c65d647378f9..58604fe11f4f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2016,8 +2016,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	struct buffer_head *bh, *page_bufs = NULL;
 	int journal_data = ext4_should_journal_data(inode);
 	sector_t pblock = 0, cur_logical = 0;
+	struct ext4_io_submit io_submit;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
+	memset(&io_submit, 0, sizeof(io_submit));
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
@@ -2109,16 +2111,16 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 
-			if (journal_data && PageChecked(page))
+			/*
+			 * Delalloc doesn't support data journalling,
+			 * but eventually maybe we'll lift this
+			 * restriction.
+			 */
+			if (unlikely(journal_data && PageChecked(page)))
 				err = __ext4_journalled_writepage(page, len);
-			else if (buffer_uninit(page_bufs)) {
-				ext4_set_bh_endio(page_bufs, inode);
-				err = block_write_full_page_endio(page,
-					noalloc_get_block_write,
-					mpd->wbc, ext4_end_io_buffer_write);
-			} else
-				err = block_write_full_page(page,
-					    noalloc_get_block_write, mpd->wbc);
+			else
+				err = ext4_bio_write_page(&io_submit, page,
+							  len, mpd->wbc);
 
 			if (!err)
 				mpd->pages_written++;
@@ -2131,6 +2133,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		}
 		pagevec_release(&pvec);
 	}
+	ext4_io_submit(&io_submit);
 	return ret;
 }
 
@@ -3426,15 +3429,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	if (io->page)
-		put_page(io->page);
-	iput(io->inode);
-	kfree(io);
-}
-
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
 	struct buffer_head *head, *bh;
@@ -3639,68 +3633,6 @@ static void dump_completed_IO(struct inode * inode)
 #endif
 }
 
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-	struct inode *inode = io->inode;
-	loff_t offset = io->offset;
-	ssize_t size = io->size;
-	int ret = 0;
-
-	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-		   "list->prev 0x%p\n",
-	           io, inode->i_ino, io->list.next, io->list.prev);
-
-	if (list_empty(&io->list))
-		return ret;
-
-	if (io->flag != EXT4_IO_UNWRITTEN)
-		return ret;
-
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		printk(KERN_EMERG "%s: failed to convert unwritten"
-			"extents to written extents, error is %d"
-			" io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-		return ret;
-	}
-
-	if (io->iocb)
-		aio_complete(io->iocb, io->result, 0);
-	/* clear the DIO AIO unwritten flag */
-	io->flag = 0;
-	return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
-	struct inode		*inode = io->inode;
-	struct ext4_inode_info	*ei = EXT4_I(inode);
-	unsigned long		flags;
-	int			ret;
-
-	mutex_lock(&inode->i_mutex);
-	ret = ext4_end_io_nolock(io);
-	if (ret < 0) {
-		mutex_unlock(&inode->i_mutex);
-		return;
-	}
-
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	if (!list_empty(&io->list))
-		list_del_init(&io->list);
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-	mutex_unlock(&inode->i_mutex);
-	ext4_free_io_end(io);
-}
-
 /*
  * This function is called from ext4_sync_file().
  *
@@ -3756,28 +3688,6 @@ int flush_completed_IO(struct inode *inode)
 	return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-	ext4_io_end_t *io = NULL;
-
-	io = kmalloc(sizeof(*io), flags);
-
-	if (io) {
-		igrab(inode);
-		io->inode = inode;
-		io->flag = 0;
-		io->offset = 0;
-		io->size = 0;
-		io->page = NULL;
-		io->iocb = NULL;
-		io->result = 0;
-		INIT_WORK(&io->work, ext4_end_io_work);
-		INIT_LIST_HEAD(&io->list);
-	}
-
-	return io;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private, int ret,
 			    bool is_async)
@@ -3797,7 +3707,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (io_end->flag != EXT4_IO_UNWRITTEN){
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
 out:
@@ -3842,7 +3752,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 		goto out;
 	}
 
-	io_end->flag = EXT4_IO_UNWRITTEN;
+	io_end->flag = EXT4_IO_END_UNWRITTEN;
 	inode = io_end->inode;
 
 	/* Add the io_end to per-inode completed io list*/
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..b972ca50f851
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,430 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+
+int __init init_ext4_pageio(void)
+{
+	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+	if (io_page_cachep == NULL)
+		return -ENOMEM;
+	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+	if (io_page_cachep == NULL) {
+		kmem_cache_destroy(io_page_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void exit_ext4_pageio(void)
+{
+	kmem_cache_destroy(io_end_cachep);
+	kmem_cache_destroy(io_page_cachep);
+}
+
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+	int i;
+
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	for (i = 0; i < io->num_io_pages; i++) {
+		if (--io->pages[i]->p_count == 0) {
+			struct page *page = io->pages[i]->p_page;
+
+			end_page_writeback(page);
+			put_page(page);
+			kmem_cache_free(io_page_cachep, io->pages[i]);
+		}
+	}
+	io->num_io_pages = 0;
+	iput(io->inode);
+	kmem_cache_free(io_end_cachep, io);
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	ssize_t size = io->size;
+	int ret = 0;
+
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	if (list_empty(&io->list))
+		return ret;
+
+	if (!(io->flag & EXT4_IO_END_UNWRITTEN))
+		return ret;
+
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
+	if (ret < 0) {
+		printk(KERN_EMERG "%s: failed to convert unwritten "
+			"extents to written extents, error is %d "
+			"io is still on inode %lu aio dio list\n",
+		       __func__, ret, inode->i_ino);
+		return ret;
+	}
+
+	if (io->iocb)
+		aio_complete(io->iocb, io->result, 0);
+	/* clear the DIO AIO unwritten flag */
+	io->flag &= ~EXT4_IO_END_UNWRITTEN;
+	return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
+	struct inode		*inode = io->inode;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	unsigned long		flags;
+	int			ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = ext4_end_io_nolock(io);
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
+		return;
+	}
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (!list_empty(&io->list))
+		list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+	mutex_unlock(&inode->i_mutex);
+	ext4_free_io_end(io);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io = NULL;
+
+	io = kmem_cache_alloc(io_end_cachep, flags);
+	if (io) {
+		memset(io, 0, sizeof(*io));
+		io->inode = igrab(inode);
+		BUG_ON(!io->inode);
+		INIT_WORK(&io->work, ext4_end_io_work);
+		INIT_LIST_HEAD(&io->list);
+	}
+	return io;
+}
+
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_io_end_t *io_end = bio->bi_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+	ext4_fsblk_t err_block;
+	int i;
+
+	BUG_ON(!io_end);
+	inode = io_end->inode;
+	bio->bi_private = NULL;
+	bio->bi_end_io = NULL;
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
+	bio_put(bio);
+
+	if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
+		pr_err("sb umounted, discard end_io request for inode %lu\n",
+			io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		return;
+	}
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long) err_block);
+	}
+
+	for (i = 0; i < io_end->num_io_pages; i++) {
+		struct page *page = io_end->pages[i]->p_page;
+		struct buffer_head *bh, *head;
+		int partial_write = 0;
+
+		head = page_buffers(page);
+		if (error)
+			SetPageError(page);
+		BUG_ON(!head);
+		if (head->b_size == PAGE_CACHE_SIZE)
+			clear_buffer_dirty(head);
+		else {
+			loff_t offset;
+			loff_t io_end_offset = io_end->offset + io_end->size;
+
+			offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+			bh = head;
+			do {
+				if ((offset >= io_end->offset) &&
+				    (offset+bh->b_size <= io_end_offset)) {
+					if (error)
+						buffer_io_error(bh);
+
+					clear_buffer_dirty(bh);
+				}
+				if (buffer_delay(bh))
+					partial_write = 1;
+				else if (!buffer_mapped(bh))
+					clear_buffer_dirty(bh);
+				else if (buffer_dirty(bh))
+					partial_write = 1;
+				offset += bh->b_size;
+				bh = bh->b_this_page;
+			} while (bh != head);
+		}
+
+		if (--io_end->pages[i]->p_count == 0) {
+			struct page *page = io_end->pages[i]->p_page;
+
+			end_page_writeback(page);
+			put_page(page);
+			kmem_cache_free(io_page_cachep, io_end->pages[i]);
+		}
+
+		/*
+		 * If this is a partial write which happened to make
+		 * all buffers uptodate then we can optimize away a
+		 * bogus readpage() for the next read(). Here we
+		 * 'discover' whether the page went uptodate as a
+		 * result of this (potentially partial) write.
+		 */
+		if (!partial_write)
+			SetPageUptodate(page);
+	}
+
+	io_end->num_io_pages = 0;
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+}
+
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+	struct bio *bio = io->io_bio;
+
+	if (bio) {
+		bio_get(io->io_bio);
+		submit_bio(io->io_op, io->io_bio);
+		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+		bio_put(io->io_bio);
+	}
+	io->io_bio = 0;
+	io->io_op = 0;
+	io->io_end = 0;
+}
+
+static int io_submit_init(struct ext4_io_submit *io,
+			  struct inode *inode,
+			  struct writeback_control *wbc,
+			  struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	int nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio *bio;
+
+	io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_end)
+		return -ENOMEM;
+	do {
+		bio = bio_alloc(GFP_NOIO, nvecs);
+		nvecs >>= 1;
+	} while (bio == NULL);
+
+	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_private = io->io_end = io_end;
+	bio->bi_end_io = ext4_end_bio;
+
+	io_end->inode = inode;
+	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
+
+	io->io_bio = bio;
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE);
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+
+static int io_submit_add_bh(struct ext4_io_submit *io,
+			    struct ext4_io_page *io_page,
+			    struct inode *inode,
+			    struct writeback_control *wbc,
+			    struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	int ret;
+
+	if (buffer_new(bh)) {
+		clear_buffer_new(bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+
+	if (!buffer_mapped(bh) || buffer_delay(bh)) {
+		if (!buffer_mapped(bh))
+			clear_buffer_dirty(bh);
+		if (io->io_bio)
+			ext4_io_submit(io);
+		return 0;
+	}
+
+	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+		ext4_io_submit(io);
+	}
+	if (io->io_bio == NULL) {
+		ret = io_submit_init(io, inode, wbc, bh);
+		if (ret)
+			return ret;
+	}
+	io_end = io->io_end;
+	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+	    (io_end->pages[io_end->num_io_pages-1] != io_page))
+		goto submit_and_retry;
+	if (buffer_uninit(bh))
+		io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
+	io->io_end->size += bh->b_size;
+	io->io_next_block++;
+	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	if ((io_end->num_io_pages == 0) ||
+	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+		io_end->pages[io_end->num_io_pages++] = io_page;
+		io_page->p_count++;
+	}
+	return 0;
+}
+
+int ext4_bio_write_page(struct ext4_io_submit *io,
+			struct page *page,
+			int len,
+			struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end, blocksize;
+	struct ext4_io_page *io_page;
+	struct buffer_head *bh, *head;
+	int ret = 0;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	ClearPageError(page);
+
+	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+	if (!io_page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	io_page->p_page = page;
+	io_page->p_count = 0;
+	get_page(page);
+
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_start >= len) {
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+			continue;
+		}
+		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+		if (ret) {
+			/*
+			 * We only get here on ENOMEM.  Not much else
+			 * we can do but mark the page as dirty, and
+			 * better luck next time.
+			 */
+			set_page_dirty(page);
+			break;
+		}
+	}
+	unlock_page(page);
+	/*
+	 * If the page was truncated before we could do the writeback,
+	 * or we had a memory allocation error while trying to write
+	 * the first buffer head, we won't have submitted any pages for
+	 * I/O.  In that case we need to make sure we've cleared the
+	 * PageWriteback bit from the page to prevent the system from
+	 * wedging later on.
+	 */
+	if (io_page->p_count == 0) {
+		put_page(page);
+		end_page_writeback(page);
+		kmem_cache_free(io_page_cachep, io_page);
+	}
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a24e9be7cb0..e13b3c3534d7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4769,9 +4769,12 @@ static int __init init_ext4_fs(void)
 	int err;
 
 	ext4_check_flag_values();
-	err = init_ext4_system_zone();
+	err = init_ext4_pageio();
 	if (err)
 		return err;
+	err = init_ext4_system_zone();
+	if (err)
+		goto out5;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
 		goto out4;
@@ -4812,6 +4815,8 @@ out3:
 	kset_unregister(ext4_kset);
 out4:
 	exit_ext4_system_zone();
+out5:
+	exit_ext4_pageio();
 	return err;
 }
 
@@ -4827,6 +4832,7 @@ static void __exit exit_ext4_fs(void)
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 	exit_ext4_system_zone();
+	exit_ext4_pageio();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
cgit v1.2.3


From 877836905da55e8f2426234f42a89287184949e9 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 27 Oct 2010 21:30:11 -0400
Subject: ext4: Check return value of sb_getblk() and friends

Fail block allocation if sb_getblk() returns NULL. In that case,
sb_find_get_block() also likely to fail so that it should skip
calling ext4_forget().

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c   | 5 +++++
 fs/ext4/mballoc.c | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 58604fe11f4f..077c3c9c432e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -761,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -EIO;
+			goto failed;
+		}
+
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d732ef5a835d..611c866ef3fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4537,6 +4537,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
+			if (unlikely(!tbh))
+				continue;
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
-- 
cgit v1.2.3


From 77ca6cdf0ab8a42f481ec997911bc89e79138723 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:11 -0400
Subject: ext4: Use return value from sb_issue_discard()

Use return value from sb_issue_discard() as return value in
ext4_issue_discard(). Since sb_issue_discard() may result in more
serious errors than just -EOPNOTSUPP it is worth to inform user of this
function about them to handle error cases properly.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 611c866ef3fe..11c2eec386ef 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2603,7 +2603,7 @@ int ext4_mb_release(struct super_block *sb)
 	return 0;
 }
 
-static inline void ext4_issue_discard(struct super_block *sb,
+static inline int ext4_issue_discard(struct super_block *sb,
 		ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
 	int ret;
@@ -2617,6 +2617,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
 	}
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From 367a51a339020ba4d9edb0ce0f21d65bd50b00c9 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:11 -0400
Subject: fs: Add FITRIM ioctl

Adds an filesystem independent ioctl to allow implementation of file
system batched discard support. I takes fstrim_range structure as an
argument. fstrim_range is definec in the include/fs.h and its
definition is as follows.

struct fstrim_range {
	start;
	len;
	minlen;
}

start	- first Byte to trim
len	- number of Bytes to trim from start
minlen	- minimum extent length to trim, free extents shorter than this
	  number of Bytes will be ignored. This will be rounded up to fs
	  block size.

It is also possible to specify NULL as an argument. In this case the
arguments will set itself as follows:

start = 0;
len = ULLONG_MAX;
minlen = 0;

So it will trim the whole file system at one run.

After the FITRIM is done, the number of actually discarded Bytes is stored
in fstrim_range.len to give the user better insight on how much storage
space has been really released for wear-leveling.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Reviewed-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ioctl.c         | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  8 ++++++++
 2 files changed, 47 insertions(+)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..e92fdbb3bc3a 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
+static int ioctl_fstrim(struct file *filp, void __user *argp)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	struct fstrim_range range;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support trim feature, return. */
+	if (sb->s_op->trim_fs == NULL)
+		return -EOPNOTSUPP;
+
+	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	if (argp == NULL) {
+		range.start = 0;
+		range.len = ULLONG_MAX;
+		range.minlen = 0;
+	} else if (copy_from_user(&range, argp, sizeof(range)))
+		return -EFAULT;
+
+	ret = sb->s_op->trim_fs(sb, &range);
+	if (ret < 0)
+		return ret;
+
+	if ((argp != NULL) &&
+	    (copy_to_user(argp, &range, sizeof(range))))
+		return -EFAULT;
+
+	return 0;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FITRIM:
+		error = ioctl_fstrim(filp, argp);
+		break;
+
 	case FS_IOC_FIEMAP:
 		return ioctl_fiemap(filp, arg);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b7..7008268e9b5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -32,6 +32,12 @@
 #define SEEK_END	2	/* seek relative to end of file */
 #define SEEK_MAX	SEEK_END
 
+struct fstrim_range {
+	uint64_t start;
+	uint64_t len;
+	uint64_t minlen;
+};
+
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
 	int nr_files;		/* read only */
@@ -316,6 +322,7 @@ struct inodes_stat_t {
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
 #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
 #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
+#define FITRIM		_IOWR('X', 121, struct fstrim_range)	/* Trim */
 
 #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
 #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
@@ -1581,6 +1588,7 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*trim_fs) (struct super_block *, struct fstrim_range *);
 };
 
 /*
-- 
cgit v1.2.3


From 7360d1731e5dc78aec867e65e55f9fb58782b5fe Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:12 -0400
Subject: ext4: Add batched discard support for ext4

Walk through allocation groups and trim all free extents. It can be
invoked through FITRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search for free extents in allocation groups specified by Byte range
start -> start+len. When the free extent is within this range, blocks
are marked as used and then trimmed. Afterwards these blocks are marked
as free in per-group bitmap.

Since fstrim is a long operation it is good to have an ability to
interrupt it by a signal. This was added by Dmitry Monakhov.
Thanks Dimitry.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   2 +
 fs/ext4/mballoc.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |   1 +
 3 files changed, 188 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ca9fda64dd4f..98dde2b7dd6f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1694,6 +1694,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 11c2eec386ef..e3bcc06b4906 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4685,3 +4685,188 @@ error_return:
 	ext4_std_error(sb, err);
 	return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	ret = ext4_issue_discard(sb, group, start, count);
+	if (ret)
+		ext4_std_error(sb, ret);
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @e4b:		ext4 buddy
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0;
+	ext4_group_t group;
+	int ret = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+			if (ret < 0)
+				break;
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_buddy e4b;
+	ext4_group_t first_group, last_group;
+	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t cnt = 0, first_block, last_block;
+	uint64_t start, len, minlen, trimmed;
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	len = range->len >> sb->s_blocksize_bits;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+	trimmed = 0;
+
+	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	/* Determine first and last group to examine based on start and len */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_block);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+				     &last_group, &last_block);
+	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+	last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+	if (first_group > last_group)
+		return -EINVAL;
+
+	for (group = first_group; group <= last_group; group++) {
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			break;
+		}
+
+		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+		else
+			last_block = len;
+
+		if (e4b.bd_info->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, &e4b, first_block,
+						last_block, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
+		}
+		ext4_mb_unload_buddy(&e4b);
+		trimmed += cnt;
+		first_block = 0;
+	}
+	range->len = trimmed * sb->s_blocksize;
+
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e13b3c3534d7..01e60aa6c478 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1189,6 +1189,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
cgit v1.2.3


From 27ee40df2b17c84aa7855907df12befe6869b7a7 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 27 Oct 2010 21:30:12 -0400
Subject: ext4: add batched_discard into ext4 feature list

Should be applied on the top of "lazy inode table initialization"
and "batched discard support" patch-sets.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 01e60aa6c478..9ce3b67b7269 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2457,9 +2457,11 @@ static struct attribute *ext4_attrs[] = {
 
 /* Features this copy of ext4 supports */
 EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
 
 static struct attribute *ext4_feat_attrs[] = {
 	ATTR_LIST(lazy_itable_init),
+	ATTR_LIST(batched_discard),
 	NULL,
 };
 
-- 
cgit v1.2.3


From bbd08344e3df8c7c1d7aa04bc0c8c9367806e12d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:12 -0400
Subject: ext4: tidy up a void argument in inode.c

This doesn't fix anything at all, it just removes a vestige
of prior use from __mpage_da_writepage()

__mpage_da_writepage() had a *void argument leftover from
its previous life as a callback; make it reflect the actual type.

Fixing this up makes it slightly more obvious to read, and
enables proper typechecking.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 077c3c9c432e..6671fcbb5293 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2424,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
  * The function finds extents of pages and scan them for all blocks.
  */
 static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc, void *data)
+				struct writeback_control *wbc,
+				struct mpage_da_data *mpd)
 {
-	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
 	struct buffer_head *bh, *head;
 	sector_t logical;
-- 
cgit v1.2.3


From 5b41d92437f1ae19b3f3ffa3b16589fd5df50ac0 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:13 -0400
Subject: ext4: implement writeback livelock avoidance using page tagging

This is analogous to Jan Kara's commit,
f446daaea9d4a420d16c606f755f3689dcb2d0ce
mm: implement writeback livelock avoidance using page tagging

but since we forked write_cache_pages, we need to reimplement
it there (and in ext4_da_writepages, since range_cyclic handling
was moved to there)

If you start a large buffered IO to a file, and then set
fsync after it, you'll find that fsync does not complete
until the other IO stops.

If you continue re-dirtying the file (say, putting dd
with conv=notrunc in a loop), when fsync finally completes
(after all IO is done), it reports via tracing that
it has written many more pages than the file contains;
in other words it has synced and re-synced pages in
the file multiple times.

This then leads to problems with our writeback_index
update, since it advances it by pages written, and
essentially sets writeback_index off the end of the
file...

With the following patch, we only sync as much as was
dirty at the time of the sync.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c           | 18 +++++++++++++++---
 include/linux/writeback.h |  2 ++
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6671fcbb5293..c9ea95ba5fde 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2809,16 +2809,21 @@ static int write_cache_pages_da(struct address_space *mapping,
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	long nr_to_write = wbc->nr_to_write;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+
 	while (!done && (index <= end)) {
 		int i;
 
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			      PAGECACHE_TAG_DIRTY,
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
@@ -2923,6 +2928,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+	pgoff_t end;
 
 	trace_ext4_da_writepages(inode, wbc);
 
@@ -2958,8 +2964,11 @@ static int ext4_da_writepages(struct address_space *mapping,
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
 		wbc->range_end  = LLONG_MAX;
 		wbc->range_cyclic = 0;
-	} else
+		end = -1;
+	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	}
 
 	/*
 	 * This works around two forms of stupidity.  The first is in
@@ -3000,6 +3009,9 @@ static int ext4_da_writepages(struct address_space *mapping,
 	pages_skipped = wbc->pages_skipped;
 
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+
 	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 72a5d647a5f2..3d132bfb4f3d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -143,6 +143,8 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 
 int generic_writepages(struct address_space *mapping,
 		       struct writeback_control *wbc);
+void tag_pages_for_writeback(struct address_space *mapping,
+			     pgoff_t start, pgoff_t end);
 int write_cache_pages(struct address_space *mapping,
 		      struct writeback_control *wbc, writepage_t writepage,
 		      void *data);
-- 
cgit v1.2.3


From 72f84e6560d18d60a091df27edf81409be6641cb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:13 -0400
Subject: ext4: update writeback_index based on last page scanned

As pointed out in a prior patch, updating the mapping's
writeback_index based on pages written isn't quite right;
what the writeback index is really supposed to reflect is
the next page which should be scanned for writeback during
periodic flush.

As in write_cache_pages(), write_cache_pages_da() does
this scanning for us as we assemble the mpd for later
writeout.  If we keep track of the next page after the
current scan, we can easily update writeback_index without
worrying about pages written vs. pages skipped, etc.

Without this, an fsync will reset writeback_index to
0 (its starting index) + however many pages it wrote, which
can mess up the progress of periodic flush.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c9ea95ba5fde..45fc5bdb7d67 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2800,12 +2800,13 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  */
 static int write_cache_pages_da(struct address_space *mapping,
 				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
+				struct mpage_da_data *mpd,
+				pgoff_t *done_index)
 {
 	int ret = 0;
 	int done = 0;
 	struct pagevec pvec;
-	int nr_pages;
+	unsigned nr_pages;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	long nr_to_write = wbc->nr_to_write;
@@ -2820,6 +2821,7 @@ static int write_cache_pages_da(struct address_space *mapping,
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 
+	*done_index = index;
 	while (!done && (index <= end)) {
 		int i;
 
@@ -2843,6 +2845,8 @@ static int write_cache_pages_da(struct address_space *mapping,
 				break;
 			}
 
+			*done_index = page->index + 1;
+
 			lock_page(page);
 
 			/*
@@ -2928,6 +2932,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+	pgoff_t done_index = 0;
 	pgoff_t end;
 
 	trace_ext4_da_writepages(inode, wbc);
@@ -3050,7 +3055,7 @@ retry:
 		mpd.io_done = 0;
 		mpd.pages_written = 0;
 		mpd.retval = 0;
-		ret = write_cache_pages_da(mapping, wbc, &mpd);
+		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
@@ -3104,14 +3109,13 @@ retry:
 			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
-	index += pages_written;
 	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = index;
+		mapping->writeback_index = done_index;
 
 out_writepages:
 	wbc->nr_to_write -= nr_to_writebump;
-- 
cgit v1.2.3


From 7f93cff90fa9be6ed45f6189e136153d1d8631b0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:13 -0400
Subject: ext4: fix kernel oops if the journal superblock has a non-zero
 j_errno

Commit 84061e0 fixed an accounting bug only to introduce the
possibility of a kernel OOPS if the journal has a non-zero j_errno
field indicating that the file system had detected a fs inconsistency.
After the journal replay, if the journal superblock indicates that the
file system has an error, this indication is transfered to the file
system and then ext4_commit_super() is called to write this to the
disk.

But since the percpu counters are now initialized after the journal
replay, the call to ext4_commit_super() will cause a kernel oops since
it needs to use the percpu counters the ext4 superblock structure.

The fix is to skip setting the ext4 free block and free inode fields
if the percpu counter has not been set.

Thanks to Ken Sumrall for reporting and analyzing the root causes of
this bug.

Addresses-Google-Bug: #3054080

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c                |  7 +++++--
 include/linux/percpu_counter.h | 10 ++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ce3b67b7269..c9e06c647ce8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3964,9 +3964,12 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	else
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+		ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeblocks_counter));
-	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+		es->s_free_inodes_count =
+			cpu_to_le32(percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeinodes_counter));
 	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8a7d510ffa9c..46f6ba56fa91 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -78,6 +78,11 @@ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 	return 1;
 }
 
+static inline int percpu_counter_initialized(struct percpu_counter *fbc)
+{
+	return (fbc->counters != NULL);
+}
+
 #else
 
 struct percpu_counter {
@@ -143,6 +148,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 	return percpu_counter_read(fbc);
 }
 
+static inline int percpu_counter_initialized(struct percpu_counter *fbc)
+{
+	return 1;
+}
+
 #endif	/* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
-- 
cgit v1.2.3


From 5dabfc78dcedbe46cb2e4872dde448de3cec2979 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:14 -0400
Subject: ext4: rename {exit,init}_ext4_*() to ext4_{exit,init}_*()

This is a cleanup to avoid namespace leaks out of fs/ext4

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/block_validity.c |  4 ++--
 fs/ext4/ext4.h           | 12 ++++++------
 fs/ext4/mballoc.c        |  4 ++--
 fs/ext4/page-io.c        |  4 ++--
 fs/ext4/super.c          | 32 ++++++++++++++++----------------
 fs/ext4/xattr.c          |  4 ++--
 fs/ext4/xattr.h          |  8 ++++----
 7 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 68bab70dd139..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,7 +29,7 @@ struct ext4_system_zone {
 
 static struct kmem_cache *ext4_system_zone_cachep;
 
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
 	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
 	if (ext4_system_zone_cachep == NULL)
@@ -37,7 +37,7 @@ int __init init_ext4_system_zone(void)
 	return 0;
 }
 
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 98dde2b7dd6f..5d72c261d7e9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1684,8 +1684,8 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
@@ -2040,8 +2040,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
 				 ext4_fsblk_t start_blk,
 				 unsigned int count);
@@ -2070,8 +2070,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
-extern int __init init_ext4_pageio(void);
-extern void exit_ext4_pageio(void);
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e3bcc06b4906..381ac565786a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2705,7 +2705,7 @@ static void ext4_remove_debugfs_entry(void)
 
 #endif
 
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
 	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
 					SLAB_RECLAIM_ACCOUNT);
@@ -2730,7 +2730,7 @@ int __init init_ext4_mballoc(void)
 	return 0;
 }
 
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
 	int i;
 	/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b972ca50f851..46a7d6a9d976 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,7 +32,7 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
-int __init init_ext4_pageio(void)
+int __init ext4_init_pageio(void)
 {
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
@@ -46,7 +46,7 @@ int __init init_ext4_pageio(void)
 	return 0;
 }
 
-void exit_ext4_pageio(void)
+void ext4_exit_pageio(void)
 {
 	kmem_cache_destroy(io_end_cachep);
 	kmem_cache_destroy(io_page_cachep);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9e06c647ce8..94e60038e05d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4770,15 +4770,15 @@ out:
 	return ret;
 }
 
-static int __init init_ext4_fs(void)
+static int __init ext4_init_fs(void)
 {
 	int err;
 
 	ext4_check_flag_values();
-	err = init_ext4_pageio();
+	err = ext4_init_pageio();
 	if (err)
 		return err;
-	err = init_ext4_system_zone();
+	err = ext4_init_system_zone();
 	if (err)
 		goto out5;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
@@ -4788,11 +4788,11 @@ static int __init init_ext4_fs(void)
 
 	err = ext4_init_feat_adverts();
 
-	err = init_ext4_mballoc();
+	err = ext4_init_mballoc();
 	if (err)
 		goto out3;
 
-	err = init_ext4_xattr();
+	err = ext4_init_xattr();
 	if (err)
 		goto out2;
 	err = init_inodecache();
@@ -4812,37 +4812,37 @@ out:
 	unregister_as_ext3();
 	destroy_inodecache();
 out1:
-	exit_ext4_xattr();
+	ext4_exit_xattr();
 out2:
-	exit_ext4_mballoc();
+	ext4_exit_mballoc();
 out3:
 	kfree(ext4_feat);
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 out4:
-	exit_ext4_system_zone();
+	ext4_exit_system_zone();
 out5:
-	exit_ext4_pageio();
+	ext4_exit_pageio();
 	return err;
 }
 
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)
 {
 	ext4_destroy_lazyinit_thread();
 	unregister_as_ext2();
 	unregister_as_ext3();
 	unregister_filesystem(&ext4_fs_type);
 	destroy_inodecache();
-	exit_ext4_xattr();
-	exit_ext4_mballoc();
+	ext4_exit_xattr();
+	ext4_exit_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
-	exit_ext4_system_zone();
-	exit_ext4_pageio();
+	ext4_exit_system_zone();
+	ext4_exit_pageio();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
 MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
-module_exit(exit_ext4_fs)
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fa4b899da4b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 #undef BLOCK_HASH_SHIFT
 
 int __init
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
 	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
 	if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
 }
 
 void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 	if (ext4_xattr_cache)
 		mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..281dd8353652 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int init_ext4_xattr(void);
-extern void exit_ext4_xattr(void);
+extern int __init ext4_init_xattr(void);
+extern void ext4_exit_xattr(void);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
 {
 }
 
-static inline int
+static __init inline int
 init_ext4_xattr(void)
 {
 	return 0;
 }
 
 static inline void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 }
 
-- 
cgit v1.2.3


From 1f109d5a17b438c4a54cbf6fd87a249e3d72fb21 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:14 -0400
Subject: ext4: make various ext4 functions be static

These functions have no need to be exported beyond file context.

No functions needed to be moved for this commit; just some function
declarations changed to be static and removed from header files.

(A similar patch was submitted by Eric Sandeen, but I wanted to handle
code movement in separate patches to make sure code changes didn't
accidentally get dropped.)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c       |  2 +-
 fs/ext4/ext4.h         |  8 --------
 fs/ext4/ext4_extents.h | 10 ----------
 fs/ext4/extents.c      | 36 ++++++++++++++++++------------------
 fs/ext4/ialloc.c       | 11 ++++++-----
 fs/ext4/inode.c        |  2 +-
 fs/ext4/super.c        |  2 +-
 7 files changed, 27 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..a12cefc20c76 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -489,7 +489,7 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5d72c261d7e9..ac1afc148b36 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1558,8 +1558,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1623,7 +1621,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1667,10 +1664,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
 extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
@@ -1723,7 +1716,6 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..e427082e9ffc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -237,19 +237,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
 				      struct ext4_extent *ex1,
 				      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a1e20c8c4e0c..bd95375314ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -739,9 +739,9 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *curp,
-				int logical, ext4_fsblk_t ptr)
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+				 struct ext4_ext_path *curp,
+				 int logical, ext4_fsblk_t ptr)
 {
 	struct ext4_extent_idx *ix;
 	int len, err;
@@ -1232,9 +1232,9 @@ out:
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
-			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_left(struct inode *inode,
+				struct ext4_ext_path *path,
+				ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
@@ -1297,9 +1297,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
-			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_right(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_extent_header *eh;
@@ -1585,9 +1585,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-int ext4_ext_try_to_merge(struct inode *inode,
-			  struct ext4_ext_path *path,
-			  struct ext4_extent *ex)
+static int ext4_ext_try_to_merge(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *ex)
 {
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
@@ -1632,9 +1632,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
  * such that there will be no overlap, and then returns 1.
  * If there is no overlap found, it returns 0.
  */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
-				    struct ext4_extent *newext,
-				    struct ext4_ext_path *path)
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
+					   struct ext4_extent *newext,
+					   struct ext4_ext_path *path)
 {
 	ext4_lblk_t b1, b2;
 	unsigned int depth, len1;
@@ -1845,9 +1845,9 @@ cleanup:
 	return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-			ext4_lblk_t num, ext_prepare_callback func,
-			void *cbdata)
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+			       ext4_lblk_t num, ext_prepare_callback func,
+			       void *cbdata)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_ext_cache cbex;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 87d228aae6b0..9666e4c6efb4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-				ext4_group_t block_group,
-				struct ext4_group_desc *gdp)
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -414,8 +415,8 @@ struct orlov_stats {
  * for a particular block group or flex_bg.  If flex_size is 1, then g
  * is a block group number; otherwise it is flex_bg number.
  */
-void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-		       int flex_size, struct orlov_stats *stats)
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+			    int flex_size, struct orlov_stats *stats)
 {
 	struct ext4_group_desc *desc;
 	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 45fc5bdb7d67..7a83c2793956 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5543,7 +5543,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94e60038e05d..158d1bca8769 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -53,7 +53,7 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
 
-struct proc_dir_entry *ext4_proc_root;
+static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 struct ext4_lazy_init *ext4_li_info;
 struct mutex ext4_li_mtx;
-- 
cgit v1.2.3


From bf89d16f6ef5389f1b9d8046e838ec87b9b3f8b9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:14 -0400
Subject: ext4: rename {ext,idx}_pblock and inline small extent functions

Cleanup namespace leaks from fs/ext4 and the inline trivial functions
ext4_{ext,idx}_pblock() and ext4_{ext,idx}_store_pblock() since the
code size actually shrinks when we make these functions inline,
they're so trivial.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_extents.h |  55 ++++++++++++++++-
 fs/ext4/extents.c      | 160 ++++++++++++++++++-------------------------------
 fs/ext4/migrate.c      |   2 +-
 fs/ext4/move_extent.c  |  22 +++----
 4 files changed, 121 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index e427082e9ffc..28ce70fd9cd0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
 	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				      0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				     0xffff);
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
 					 sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 						   int num,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bd95375314ab..20e6c3c65fa1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
-
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ex->ee_start_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ix->ei_leaf_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		/* try to predict block placement */
 		ex = path[depth].p_ext;
 		if (ex)
-			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+			return (ext4_ext_pblock(ex) +
+				(block - le32_to_cpu(ex->ee_block)));
 
 		/* it looks like index is empty;
 		 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 	for (k = 0; k <= l; k++, path++) {
 		if (path->p_idx) {
 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-			    idx_pblock(path->p_idx));
+			    ext4_idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
 			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
 				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
-				  ext_pblock(path->p_ext));
+				  ext4_ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
 	}
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 
 	path->p_idx = l - 1;
 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-		  idx_pblock(path->p_idx));
+		  ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
 	path->p_ext = l - 1;
 	ext_debug("  -> %d:%llu:[%d]%d ",
 			le32_to_cpu(path->p_ext->ee_block),
-			ext_pblock(path->p_ext),
+			ext4_ext_pblock(path->p_ext),
 			ext4_ext_is_uninitialized(path->p_ext),
 			ext4_ext_get_actual_len(path->p_ext));
 
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
 		ext4_ext_binsearch_idx(inode, path + ppos, block);
-		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	ext4_ext_binsearch(inode, path + ppos, block);
 	/* if not an empty leaf */
 	if (path[ppos].p_ext)
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext_pblock(path[depth].p_ext),
+				ext4_ext_pblock(path[depth].p_ext),
 				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
 					le32_to_cpu(path[i].p_idx->ei_block),
-					idx_pblock(path[i].p_idx),
+					ext4_idx_pblock(path[i].p_idx),
 					newblock);
 			/*memmove(++fidx, path[i].p_idx++,
 					sizeof(struct ext4_extent_idx));
@@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-		  idx_pblock(EXT_FIRST_INDEX(neh)));
+		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -1286,7 +1238,7 @@ static int ext4_ext_search_left(struct inode *inode,
 	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-	*phys = ext_pblock(ex) + ee_len - 1;
+	*phys = ext4_ext_pblock(ex) + ee_len - 1;
 	return 0;
 }
 
@@ -1342,7 +1294,7 @@ static int ext4_ext_search_right(struct inode *inode,
 			}
 		}
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1357,7 +1309,7 @@ static int ext4_ext_search_right(struct inode *inode,
 		/* next allocated block in this leaf */
 		ex++;
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1376,7 +1328,7 @@ got_index:
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		bh = sb_bread(inode->i_sb, block);
 		if (bh == NULL)
@@ -1388,7 +1340,7 @@ got_index:
 			return -EIO;
 		}
 		ix = EXT_FIRST_INDEX(eh);
-		block = idx_pblock(ix);
+		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
@@ -1402,7 +1354,7 @@ got_index:
 	}
 	ex = EXT_FIRST_EXTENT(eh);
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	put_bh(bh);
 	return 0;
 }
@@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
 		return 1;
 	return 0;
 }
@@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-				ext4_ext_is_uninitialized(newext),
-				ext4_ext_get_actual_len(newext),
-				le32_to_cpu(ex->ee_block),
-				ext4_ext_is_uninitialized(ex),
-				ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_is_uninitialized(newext),
+			  ext4_ext_get_actual_len(newext),
+			  le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
+			  ext4_ext_get_actual_len(ex),
+			  ext4_ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
@@ -1780,7 +1733,7 @@ has_space:
 		/* there is no extent in this leaf, create first one */
 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1747,7 @@ has_space:
 			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
-					ext_pblock(newext),
+					ext4_ext_pblock(newext),
 					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1761,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1772,7 @@ has_space:
 	le16_add_cpu(&eh->eh_entries, 1);
 	nearex = path[depth].p_ext;
 	nearex->ee_block = newext->ee_block;
-	ext4_ext_store_pblock(nearex, ext_pblock(newext));
+	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
 	nearex->ee_len = newext->ee_len;
 
 merge:
@@ -1923,7 +1876,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
 			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_start = ext4_ext_pblock(ex);
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
@@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 
 	/* free index block */
 	path--;
-	leaf = idx_pblock(path->p_idx);
+	leaf = ext4_idx_pblock(path->p_idx);
 	if (unlikely(path->p_hdr->eh_entries == 0)) {
 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
 		return -EIO;
@@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t start;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
-		start = ext_pblock(ex) + ee_len - num;
+		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
 		ext4_free_blocks(handle, inode, 0, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			goto out;
 
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
-				ext_pblock(ex));
+				ext4_ext_pblock(ex));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
 		ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2374,9 @@ again:
 			struct buffer_head *bh;
 			/* go to the next level */
 			ext_debug("move to level %d (block %llu)\n",
-				  i + 1, idx_pblock(path[i].p_idx));
+				  i + 1, ext4_idx_pblock(path[i].p_idx));
 			memset(path + i + 1, 0, sizeof(*path));
-			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
 			if (!bh) {
 				/* should we reset i_size? */
 				err = -EIO;
@@ -2543,7 +2496,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 	int ret;
 
 	ee_len    = ext4_ext_get_actual_len(ex);
-	ee_pblock = ext_pblock(ex);
+	ee_pblock = ext4_ext_pblock(ex);
 
 	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len,
 			       GFP_NOFS, BLKDEV_IFL_WAIT);
@@ -2596,12 +2549,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
 	/*
 	 * It is safe to convert extent to initialized via explicit
@@ -2620,7 +2573,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zeroed the full extent */
 		return allocated;
@@ -2655,7 +2608,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = cpu_to_le16(ee_len - allocated);
 			ext4_ext_mark_uninitialized(ex);
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 
 			ex3 = &newex;
@@ -2670,7 +2623,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					goto fix_extent_len;
 				ex->ee_block = orig_ex.ee_block;
 				ex->ee_len   = orig_ex.ee_len;
-				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+				ext4_ext_store_pblock(ex,
+					ext4_ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
 				/* blocks available from map->m_lblk */
 				return allocated;
@@ -2727,7 +2681,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
 			/* blocks available from map->m_lblk */
@@ -2778,7 +2732,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
 			/* blocks available from map->m_lblk */
@@ -2847,7 +2801,7 @@ insert:
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zero out the first half */
 		return allocated;
@@ -2860,7 +2814,7 @@ out:
 fix_extent_len:
 	ex->ee_block = orig_ex.ee_block;
 	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 	ext4_ext_mark_uninitialized(ex);
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
@@ -2918,12 +2872,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
 	/*
 	 * It is safe to convert extent to initialized via explicit
@@ -2972,7 +2926,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
 			/* blocks available from map->m_lblk */
@@ -3044,7 +2998,7 @@ insert:
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zero out the first half */
 		return allocated;
@@ -3057,7 +3011,7 @@ out:
 fix_extent_len:
 	ex->ee_block = orig_ex.ee_block;
 	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 	ext4_ext_mark_uninitialized(ex);
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
@@ -3347,7 +3301,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			/* block is already allocated */
 			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
-				   + ext_pblock(&newex);
+				   + ext4_ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = ext4_ext_get_actual_len(&newex) -
 				(map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3385,7 +3339,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ex = path[depth].p_ext;
 	if (ex) {
 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-		ext4_fsblk_t ee_start = ext_pblock(ex);
+		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
 		unsigned short ee_len;
 
 		/*
@@ -3513,13 +3467,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
 				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
 
 	/* previous routine could use block we allocated */
-	newblock = ext_pblock(&newex);
+	newblock = ext4_ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 	if (allocated > map->m_len)
 		allocated = map->m_len;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..25f3a974b725 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 	struct buffer_head *bh;
 	struct ext4_extent_header *eh;
 
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh)
 		return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 
 			/* index block */
 			path[ppos].p_idx++;
-			path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 			if (path[ppos+1].p_bh)
 				brelse(path[ppos+1].p_bh);
 			path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 				path[cur_ppos].p_idx =
 					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
 				path[cur_ppos].p_block =
-					idx_pblock(path[cur_ppos].p_idx);
+					ext4_idx_pblock(path[cur_ppos].p_idx);
 				if (path[cur_ppos+1].p_bh)
 					brelse(path[cur_ppos+1].p_bh);
 				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
 			path[leaf_ppos].p_block =
-					ext_pblock(path[leaf_ppos].p_ext);
+					ext4_ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 			 */
 			o_end->ee_block = end_ext->ee_block;
 			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 		}
 
 		o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		 */
 		o_end->ee_block = end_ext->ee_block;
 		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 
 		/*
 		 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
 	/* Insert new entry */
 	if (new_ext->ee_len) {
 		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
 	}
 
 	/* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	start_ext.ee_len = end_ext.ee_len = 0;
 
 	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
 	new_ext.ee_len = dext->ee_len;
 	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
 	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 		copy_extent_status(oext, &end_ext);
 		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
 		ext4_ext_store_pblock(&end_ext,
-			(ext_pblock(o_end) + oext_alen - end_ext_alen));
+			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
 		end_ext.ee_block =
 			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
 			oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 	/* When tmp_dext is too large, pick up the target range. */
 	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
 
-	ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
 	tmp_dext->ee_block =
 			cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
 	tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 		tmp_dext->ee_len = cpu_to_le16(max_count);
 
 	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
 
 	/* Adjust extent length if donor extent is larger than orig */
 	if (ext4_ext_get_actual_len(tmp_dext) >
-- 
cgit v1.2.3


From 4a873a472b3bbcfd425d7ae210afdec28c04e2e5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:14 -0400
Subject: ext4: move flush_completed_IO to fs/ext4/fsync.c and make it static

Fix a namespace leak by moving the function to the file where it is
used and making it static.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 -
 fs/ext4/fsync.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/inode.c | 83 ---------------------------------------------------------
 3 files changed, 83 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ac1afc148b36..c0570a68a2bc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1721,7 +1721,6 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..1c701f635961 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
 
 #include <trace/events/ext4.h>
 
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef	EXT4_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
+
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+#endif
+}
+
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
+ */
+static int flush_completed_IO(struct inode *inode)
+{
+	ext4_io_end_t *io;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
+	int ret = 0;
+	int ret2 = 0;
+
+	if (list_empty(&ei->i_completed_io_list))
+		return ret;
+
+	dump_completed_IO(inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&ei->i_completed_io_list)){
+		io = list_entry(ei->i_completed_io_list.next,
+				ext4_io_end_t, list);
+		/*
+		 * Calling ext4_end_io_nolock() to convert completed
+		 * IO to written.
+		 *
+		 * When ext4_sync_file() is called, run_queue() may already
+		 * about to flush the work corresponding to this io structure.
+		 * It will be upset if it founds the io structure related
+		 * to the work-to-be schedule is freed.
+		 *
+		 * Thus we need to keep the io structure still valid here after
+		 * convertion finished. The io structure has a flag to
+		 * avoid double converting from both fsync and background work
+		 * queue work.
+		 */
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		ret = ext4_end_io_nolock(io);
+		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+		if (ret < 0)
+			ret2 = ret;
+		else
+			list_del_init(&io->list);
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+	return (ret2 < 0) ? ret2 : 0;
+}
+
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7a83c2793956..9e60d0b8fa75 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3626,89 +3626,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef	EXT4_DEBUG
-	struct list_head *cur, *before, *after;
-	ext4_io_end_t *io, *io0, *io1;
-	unsigned long flags;
-
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-		return;
-	}
-
-	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-		cur = &io->list;
-		before = cur->prev;
-		io0 = container_of(before, ext4_io_end_t, list);
-		after = cur->next;
-		io1 = container_of(after, ext4_io_end_t, list);
-
-		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-			    io, inode->i_ino, io0, io1);
-	}
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-	ext4_io_end_t *io;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long flags;
-	int ret = 0;
-	int ret2 = 0;
-
-	if (list_empty(&ei->i_completed_io_list))
-		return ret;
-
-	dump_completed_IO(inode);
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	while (!list_empty(&ei->i_completed_io_list)){
-		io = list_entry(ei->i_completed_io_list.next,
-				ext4_io_end_t, list);
-		/*
-		 * Calling ext4_end_io_nolock() to convert completed
-		 * IO to written.
-		 *
-		 * When ext4_sync_file() is called, run_queue() may already
-		 * about to flush the work corresponding to this io structure.
-		 * It will be upset if it founds the io structure related
-		 * to the work-to-be schedule is freed.
-		 *
-		 * Thus we need to keep the io structure still valid here after
-		 * convertion finished. The io structure has a flag to
-		 * avoid double converting from both fsync and background work
-		 * queue work.
-		 */
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-		ret = ext4_end_io_nolock(io);
-		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-		if (ret < 0)
-			ret2 = ret;
-		else
-			list_del_init(&io->list);
-	}
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-	return (ret2 < 0) ? ret2 : 0;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private, int ret,
 			    bool is_async)
-- 
cgit v1.2.3


From 61d08673de1fe68bfba86203258377bf39f234b6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Oct 2010 21:30:15 -0400
Subject: ext4: rename mark_bitmap_end() to ext4_mark_bitmap_end()

Fix a namespace leak from fs/ext4

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 3 ++-
 fs/ext4/ext4.h   | 2 +-
 fs/ext4/ialloc.c | 4 ++--
 fs/ext4/resize.c | 7 ++++---
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a12cefc20c76..14c3af26c671 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * less than the blocksize * 8 ( which is the size
 		 * of bitmap ), set rest of the block bitmap to 1
 		 */
-		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+		ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+				     bh->b_data);
 	}
 	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c0570a68a2bc..202668c5607d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1664,7 +1664,7 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9666e4c6efb4..509f429f71e8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
  * need to use it within a single byte (to ensure we get endianness right).
  * We can use memset for the rest of the bitmap as there are no other users.
  */
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 {
 	int i;
 
@@ -86,7 +86,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
 
 	return EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 2f5e347de48b..f398474e2784 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -252,7 +252,8 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if ((err = extend_or_restart_transaction(handle, 2, bh)))
 		goto exit_bh;
 
-	mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+	ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+			     bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
 	/* Mark unused entries in inode bitmap used */
@@ -263,8 +264,8 @@ static int setup_new_group_blocks(struct super_block *sb,
 		goto exit_journal;
 	}
 
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-			bh->b_data);
+	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+			     bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
 	brelse(bh);
-- 
cgit v1.2.3


From eee4adc709afe40d8c02fa154c63dbeb55d911e3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 27 Oct 2010 21:30:15 -0400
Subject: ext4: move ext4_mb_{get,put}_buddy_cache_lock and make them static

These functions are only used within fs/ext4/mballoc.c, so move them
so they are used after they are defined, and then make them be static.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   3 --
 fs/ext4/mballoc.c | 157 +++++++++++++++++++++++++++---------------------------
 2 files changed, 79 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 202668c5607d..8b5dd6369f82 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1684,9 +1684,6 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 381ac565786a..328ea9cec57b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -946,6 +946,85 @@ out:
 	return err;
 }
 
+/*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen  whild holding the buddy cache
+ * lock
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+					ext4_group_t group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	int groups_per_page;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+
+		if ((first_group + i) >= ngroups)
+			break;
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		down_write_nested(&grp->alloc_sem, i);
+	}
+	return i;
+}
+
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+					 ext4_group_t group, int locked_group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+	/* release locks on all the groups */
+	for (i = 0; i < locked_group; i++) {
+
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		up_write(&grp->alloc_sem);
+	}
+
+}
+
 /*
  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
@@ -1923,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	int groups_per_page;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
-
-		if ((first_group + i) >= ngroups)
-			break;
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		down_write_nested(&grp->alloc_sem, i);
-	}
-	return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-					ext4_group_t group, int locked_group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-	/* release locks on all the groups */
-	for (i = 0; i < locked_group; i++) {
-
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		up_write(&grp->alloc_sem);
-	}
-
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-- 
cgit v1.2.3


From a6371b636f9f007ee5c90f85de048bc1b675424a Mon Sep 17 00:00:00 2001
From: Kazuya Mio <k-mio@sx.jp.nec.com>
Date: Wed, 27 Oct 2010 21:30:15 -0400
Subject: ext4: fix compile error in ext4_fallocate()

When I compiled 2.6.36-rc3 kernel with EXT4FS_DEBUG definition, I got
the following compile error.

  CC [M]  fs/ext4/extents.o
fs/ext4/extents.c: In function 'ext4_fallocate':
fs/ext4/extents.c:3772: error: 'block' undeclared (first use in this function)
fs/ext4/extents.c:3772: error: (Each undeclared identifier is reported only once
fs/ext4/extents.c:3772: error: for each function it appears in.)
make[2]: *** [fs/ext4/extents.o] Error 1

The patch fixes this problem.

Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 20e6c3c65fa1..a17a676a3106 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3668,7 +3668,7 @@ retry:
 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, max_blocks);
 #endif
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
-- 
cgit v1.2.3


From beed5ecbaa377fa8bb6a54a6608e8725a21efdbc Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Wed, 27 Oct 2010 22:08:42 -0400
Subject: ext4: fix unbalanced mutex unlock in error path of
 ext4_li_request_new

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 158d1bca8769..3b4984d37a68 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2902,28 +2902,26 @@ static int ext4_register_li_request(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_li_request *elr;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
-	int ret = 0;
+	int ret;
 
 	if (sbi->s_li_request != NULL)
-		goto out;
+		return 0;
 
 	if (first_not_zeroed == ngroups ||
 	    (sb->s_flags & MS_RDONLY) ||
 	    !test_opt(sb, INIT_INODE_TABLE)) {
 		sbi->s_li_request = NULL;
-		goto out;
+		return 0;
 	}
 
 	if (first_not_zeroed == ngroups) {
 		sbi->s_li_request = NULL;
-		goto out;
+		return 0;
 	}
 
 	elr = ext4_li_request_new(sb, first_not_zeroed);
-	if (!elr) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!elr)
+		return -ENOMEM;
 
 	mutex_lock(&ext4_li_mtx);
 
@@ -2944,14 +2942,10 @@ static int ext4_register_li_request(struct super_block *sb,
 		if (ret)
 			goto out;
 	}
-
-	mutex_unlock(&ext4_li_mtx);
-
 out:
-	if (ret) {
-		mutex_unlock(&ext4_li_mtx);
+	mutex_unlock(&ext4_li_mtx);
+	if (ret)
 		kfree(elr);
-	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 3d287de3b828226e5a450c7fd5bf4283792dc2b0 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 27 Oct 2010 22:08:46 -0400
Subject: ext4: optimize orphan_list handling for ext4_setattr

Surprisingly chown() on ext4 is not SMP scalable operation.
Due to unconditional orphan_del(NULL, inode) in ext4_setattr()
result in significant performance overhead because of global orphan
mutex, especially in no-journal mode (where orphan_add() is noop).
It is possible to skip explicit orphan_del if possible.
Results of fchown() micro-benchmark in no-journal mode
while (1) {
   iteration++;
   fchown(fd, uid, gid);
   fchown(fd, uid + 1, gid + 1)
}
measured: iterations per millisecond
| nr_tasks | w/o patch | with patch |
|        1 |       142 |        185 |
|        4 |       109 |        642 |

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9e60d0b8fa75..3ba237b0b2aa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5281,6 +5281,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error, rc = 0;
+	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 
 	error = inode_change_ok(inode, attr);
@@ -5336,8 +5337,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-
-		error = ext4_orphan_add(handle, inode);
+		if (ext4_handle_valid(handle)) {
+			error = ext4_orphan_add(handle, inode);
+			orphan = 1;
+		}
 		EXT4_I(inode)->i_disksize = attr->ia_size;
 		rc = ext4_mark_inode_dirty(handle, inode);
 		if (!error)
@@ -5355,6 +5358,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 					goto err_out;
 				}
 				ext4_orphan_del(handle, inode);
+				orphan = 0;
 				ext4_journal_stop(handle);
 				goto err_out;
 			}
@@ -5377,7 +5381,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	 * If the call to ext4_truncate failed to get a transaction handle at
 	 * all, we need to clean up the in-core orphan list manually.
 	 */
-	if (inode->i_nlink)
+	if (orphan && inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
-- 
cgit v1.2.3


From 57ee047b4d6bb4bcc74be0329441d1b242e57e61 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 4 Aug 2010 16:29:03 +0000
Subject: 9p: remove unneeded checks

git_t is unsigned an can never be less than zero.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index ef5905f7c8a3..88efc161743b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -886,10 +886,6 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	gid = v9fs_get_fsgid_for_create(dir);
-	if (gid < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
-		goto error;
-	}
 
 	name = (char *) dentry->d_name.name;
 	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
@@ -1616,11 +1612,6 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 
 	gid = v9fs_get_fsgid_for_create(dir);
 
-	if (gid < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
-		goto error;
-	}
-
 	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
 	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
 
@@ -1885,10 +1876,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
 	}
 
 	gid = v9fs_get_fsgid_for_create(dir);
-	if (gid < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
-		goto error;
-	}
 
 	name = (char *) dentry->d_name.name;
 
-- 
cgit v1.2.3


From 3834b12a18d51d6c535ea52e16355d75806ffe38 Mon Sep 17 00:00:00 2001
From: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
Date: Tue, 3 Aug 2010 11:55:40 +0000
Subject: fs/9p: setrlimit fix for 9p write

Current 9p client file write code does not check for RLIMIT_FSIZE resource.
This bug was found by running LTP test case for setrlimit. This bug is fixed
by calling generic_write_checks before sending the write request to the
server.
Without this patch: the write function is allowed to write above the
RLIMIT_FSIZE set by user.
With this patch: the write function checks for RLIMIT_SIZE and writes upto
the size limit.

Signed-off-by: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..89c44e92022c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -219,7 +219,9 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
 		size_t count, loff_t * offset)
 {
-	int n, rsize, total = 0;
+	ssize_t retval;
+	size_t total = 0;
+	int n, rsize;
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -234,6 +236,17 @@ v9fs_file_write(struct file *filp, const char __user * data,
 
 	rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
 
+	retval = generic_write_checks(filp, &origin, &count, 0);
+	if (retval)
+		goto out;
+
+	retval = -EINVAL;
+	if ((ssize_t) count < 0)
+		goto out;
+	retval = 0;
+	if (!count)
+		goto out;
+
 	do {
 		if (count < rsize)
 			rsize = count;
@@ -258,9 +271,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	}
 
 	if (n < 0)
-		return n;
-
-	return total;
+		retval = n;
+	else
+		retval = total;
+out:
+	return retval;
 }
 
 static int v9fs_file_fsync(struct file *filp, int datasync)
-- 
cgit v1.2.3


From 8812a3d5f873e28cd08ec8afe328c4182b72db49 Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Mon, 9 Aug 2010 11:33:10 +0000
Subject: 9p: Pass the correct end of buffer to p9dirent_read

A patch was accepted recently for sending correct buffer size to p9stat_read.
We need a similar patch in v9fs_dir_readdir_dotl to send correct end of buffer
to p9dirent_read.

Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..170f5bb8ebe0 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 		while (rdir->head < rdir->tail) {
 
 			err = p9dirent_read(rdir->buf + rdir->head,
-						buflen - rdir->head, &curdirent,
+						rdir->tail - rdir->head,
+						&curdirent,
 						fid->clnt->proto_version);
 			if (err < 0) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
-- 
cgit v1.2.3


From 7c7298cffc8f4417c95117c2a432f962c066499d Mon Sep 17 00:00:00 2001
From: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
Date: Wed, 18 Aug 2010 06:01:52 +0000
Subject: fs/9p: mkdir fix for setting S_ISGID bit as per parent directory

The current implementation of 9p client mkdir function does not
set the S_ISGID mode bit for the directory being created if the
parent directory has this bit set. This patch fixes this problem
so that the newly created directory inherits the gid from parent
directory and not from the process creating this directory, when
the S_ISGID bit is set in parent directory.

Signed-off-by: Harsh Prateek Bora <harsh@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 88efc161743b..c3d200d41dca 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -876,6 +876,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
 	v9ses = v9fs_inode2v9ses(dir);
 
 	mode |= S_IFDIR;
+	if (dir->i_mode & S_ISGID)
+		mode |= S_ISGID;
 	dir_dentry = v9fs_dentry_from_dir_inode(dir);
 	dfid = v9fs_fid_lookup(dir_dentry);
 	if (IS_ERR(dfid)) {
-- 
cgit v1.2.3


From 3e24ad2ff9d477f949acd0982cf12e58812210cb Mon Sep 17 00:00:00 2001
From: jvrao <jvrao@linux.vnet.ibm.com>
Date: Tue, 24 Aug 2010 15:43:28 +0000
Subject: 9p: Add a Direct IO support for non-cached operations.

The presence of v9fs_direct_IO() in the address space ops vector
allowes open() O_DIRECT flags which would have failed otherwise.

In the non-cached mode, we shunt off direct read and write requests before
the VFS gets them, so this method should never be called.

Direct IO is not 'yet' supported in the cached mode. Hence when
this routine is called through generic_file_aio_read(), the read/write fails
with an error.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_addr.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
 	return 0;
 }
 
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests before
+ * the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not 'yet' supported in the cached mode. Hence when
+ * this routine is called through generic_file_aio_read(), the read/write fails
+ * with an error.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		loff_t pos, unsigned long nr_segs)
+{
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+			"off/no(%lld/%lu) EINVAL\n",
+			iocb->ki_filp->f_path.dentry->d_name.name,
+			(long long) pos, nr_segs);
+
+	return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
       .readpages = v9fs_vfs_readpages,
       .releasepage = v9fs_release_page,
       .invalidatepage = v9fs_invalidate_page,
       .launder_page = v9fs_launder_page,
+      .direct_IO = v9fs_direct_IO,
 };
-- 
cgit v1.2.3


From 8d40fa2492eb3dcf02468eef2f6bba450be42b22 Mon Sep 17 00:00:00 2001
From: jvrao <jvrao@linux.vnet.ibm.com>
Date: Mon, 30 Aug 2010 13:23:20 -0500
Subject: fs/9p: Remove the redundant rsize calculation in v9fs_file_write()

the same calculation is done in p9_client_write

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 89c44e92022c..f455c45a8c5f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -221,7 +221,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 {
 	ssize_t retval;
 	size_t total = 0;
-	int n, rsize;
+	int n;
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -234,8 +234,6 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	fid = filp->private_data;
 	clnt = fid->clnt;
 
-	rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
-
 	retval = generic_write_checks(filp, &origin, &count, 0);
 	if (retval)
 		goto out;
@@ -248,11 +246,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		goto out;
 
 	do {
-		if (count < rsize)
-			rsize = count;
-
-		n = p9_client_write(fid, NULL, data+total, origin+total,
-									rsize);
+		n = p9_client_write(fid, NULL, data+total, origin+total, count);
 		if (n <= 0)
 			break;
 		count -= n;
-- 
cgit v1.2.3


From 85ff872d3f4a62d076d698bd1fa15ca2a4d7c100 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:39 +0530
Subject: fs/9p: Implement POSIX ACL permission checking function

The ACL value is fetched as a part of inode initialization
from the server and the permission checking function use the
cached value of the ACL

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/Kconfig     | 13 ++++++++
 fs/9p/Makefile    |  1 +
 fs/9p/acl.c       | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/acl.h       | 27 ++++++++++++++++
 fs/9p/vfs_inode.c | 10 +++++-
 fs/9p/vfs_super.c |  9 +++++-
 fs/9p/xattr.c     | 48 ++++++++++++++++------------
 fs/9p/xattr.h     |  4 +++
 8 files changed, 186 insertions(+), 22 deletions(-)
 create mode 100644 fs/9p/acl.c
 create mode 100644 fs/9p/acl.h

(limited to 'fs')

diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..7e0511476797 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
 	  Choose Y here to enable persistent, read-only local
 	  caching support for 9p clients using FS-Cache
 
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	depends on 9P_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..f8ba37effd1b 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..a8c013615a22
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+		posix_acl_release(dacl);
+		posix_acl_release(pacl);
+	} else
+		retval = -EIO;
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p Always cache the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..b1414f7c82e0
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask);
+#else
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+#endif
+#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c3d200d41dca..1249b8323961 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -44,6 +45,7 @@
 #include "fid.h"
 #include "cache.h"
 #include "xattr.h"
+#include "acl.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -500,6 +502,11 @@ v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	v9fs_vcookie_set_qid(ret, &st->qid);
 	v9fs_cache_inode_get_cookie(ret);
 #endif
+	err = v9fs_get_acl(ret, fid);
+	if (err) {
+		iput(ret);
+		goto error;
+	}
 	kfree(st);
 	return ret;
 error:
@@ -1959,7 +1966,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.getxattr = generic_getxattr,
 	.removexattr = generic_removexattr,
 	.listxattr = v9fs_listxattr,
-
+	.check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1986,6 +1993,7 @@ static const struct inode_operations v9fs_file_inode_operations_dotl = {
 	.getxattr = generic_getxattr,
 	.removexattr = generic_removexattr,
 	.listxattr = v9fs_listxattr,
+	.check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations = {
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..14da5778d44e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -46,6 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "xattr.h"
+#include "acl.h"
 
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
@@ -88,6 +89,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
 
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
+
 	save_mount_options(sb, data);
 }
 
@@ -149,7 +154,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 	sb->s_root = root;
-
 	if (v9fs_proto_dotl(v9ses)) {
 		struct p9_stat_dotl *st = NULL;
 		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,6 +178,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		p9stat_free(st);
 		kfree(st);
 	}
+	retval = v9fs_get_acl(inode, fid);
+	if (retval)
+		goto release_sb;
 
 	v9fs_fid_add(root, fid);
 
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..67ac6cfc1a07 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
 #include "fid.h"
 #include "xattr.h"
 
-/*
- * v9fs_xattr_get()
- *
- * Copy an extended attribute into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
-		       void *buffer, size_t buffer_size)
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+			   void *buffer, size_t buffer_size)
 {
 	ssize_t retval;
 	int msize, read_count;
 	u64 offset = 0, attr_size;
-	struct p9_fid *fid, *attr_fid;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
-		__func__, name, buffer_size);
-
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
+	struct p9_fid *attr_fid;
 
 	attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
 	if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
 
 }
 
+
+/*
+ * v9fs_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t buffer_size)
+{
+	struct p9_fid *fid;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+		__func__, name, buffer_size);
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
+}
+
 /*
  * v9fs_xattr_set()
  *
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..ec908c693342 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,14 @@
 #define FS_9P_XATTR_H
 
 #include <linux/xattr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
 
 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
 
+extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
+				  void *, size_t);
 extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
 			      void *, size_t);
 extern int v9fs_xattr_set(struct dentry *, const char *,
-- 
cgit v1.2.3


From 7a4566b0b8fa67c7cd7be9f2969a085920356abb Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:39 +0530
Subject: fs/9p: Add xattr callbacks for POSIX ACL

This patch implement fetching POSIX ACL from the server

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c   | 41 +++++++++++++++++++++++++++++++++++++++++
 fs/9p/xattr.c |  4 ++++
 fs/9p/xattr.h |  2 ++
 3 files changed, 47 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index a8c013615a22..4293b4599ff6 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -94,3 +94,44 @@ int v9fs_check_acl(struct inode *inode, int mask)
 	}
 	return -EAGAIN;
 }
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	return 0;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 67ac6cfc1a07..43ec7df84336 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -164,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 
 const struct xattr_handler *v9fs_xattr_handlers[] = {
 	&v9fs_xattr_user_handler,
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	&v9fs_xattr_acl_access_handler,
+	&v9fs_xattr_acl_default_handler,
+#endif
 	NULL
 };
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index ec908c693342..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
 
 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
+extern const struct xattr_handler v9fs_xattr_acl_access_handler;
+extern const struct xattr_handler v9fs_xattr_acl_default_handler;
 
 extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
 				  void *, size_t);
-- 
cgit v1.2.3


From 22d8dcdf8f8a3882d98757e78169014bb0bc6b23 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:40 +0530
Subject: fs/9p: Implement setting posix acl

This patch also update mode bits, as a normal file system.
I am not sure wether we should do that, considering that
a setxattr on the server will again update the ACL/mode value

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c       | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/9p/v9fs_vfs.h  |  1 +
 fs/9p/vfs_inode.c |  2 +-
 3 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 4293b4599ff6..cad38bc1710e 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -17,9 +17,11 @@
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include <linux/posix_acl_xattr.h>
 #include "xattr.h"
 #include "acl.h"
+#include "v9fs_vfs.h"
 
 static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
@@ -119,7 +121,77 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 			      const void *value, size_t size,
 			      int flags, int type)
 {
-	return 0;
+	int retval;
+	struct posix_acl *acl;
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * ACL can be represented
+					 * by the mode bits. So don't
+					 * update ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Updte the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME should we update ctime ?
+				 * What is the following setxattr update the
+				 * mode ?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = -EINVAL;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
 }
 
 const struct xattr_handler v9fs_xattr_acl_access_handler = {
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..09861295eef9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -64,3 +64,4 @@ int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 1249b8323961..4b67bf1fb1d5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1242,7 +1242,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
  *
  */
 
-static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 {
 	int retval;
 	struct v9fs_session_info *v9ses;
-- 
cgit v1.2.3


From 6e8dc55550273084b7fb5846df2f44439f5d03d9 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:40 +0530
Subject: fs/9p: Update ACL on chmod

We need update the acl value on chmod

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c       | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/acl.h       |  5 +++++
 fs/9p/vfs_inode.c |  6 ++++++
 3 files changed, 66 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index cad38bc1710e..8f2acde74c05 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -97,6 +97,61 @@ int v9fs_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+	/* Set a setxattr request to server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
 static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
 			      void *buffer, size_t size, int type)
 {
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index b1414f7c82e0..0adcc4326d18 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -17,11 +17,16 @@
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
 extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_acl_chmod(struct dentry *);
 #else
 #define v9fs_check_acl NULL
 static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 {
 	return 0;
 }
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+	return 0;
+}
 #endif
 #endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4b67bf1fb1d5..bdc64d1c22fb 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1284,6 +1284,12 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 
 	setattr_copy(dentry->d_inode, iattr);
 	mark_inode_dirty(dentry->d_inode);
+	if (iattr->ia_valid & ATTR_MODE) {
+		/* We also want to update ACL when we update mode bits */
+		retval = v9fs_acl_chmod(dentry);
+		if (retval < 0)
+			return retval;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From ad77dbce567128d59b37a14c9562c8af6f63aeca Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:40 +0530
Subject: fs/9p: Implement create time inheritance

Inherit default ACL on create

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c       |  52 ++++++++++++++++++++
 fs/9p/acl.h       |  17 +++++++
 fs/9p/vfs_inode.c | 141 +++++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 166 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 8f2acde74c05..8b3c54a4958c 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -152,6 +152,58 @@ int v9fs_acl_chmod(struct dentry *dentry)
 	return retval;
 }
 
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+	if (dpacl)
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	if (pacl)
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	posix_acl_release(dpacl);
+	posix_acl_release(pacl);
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = acl;
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		retval = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+	}
+	*modep  = mode;
+	return 0;
+cleanup:
+	posix_acl_release(acl);
+	return retval;
+
+}
+
 static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
 			      void *buffer, size_t size, int type)
 {
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 0adcc4326d18..59e18c2e8c7e 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -18,6 +18,10 @@
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
 extern int v9fs_check_acl(struct inode *inode, int mask);
 extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
 #else
 #define v9fs_check_acl NULL
 static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
@@ -28,5 +32,18 @@ static inline int v9fs_acl_chmod(struct dentry *dentry)
 {
 	return 0;
 }
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+				      struct posix_acl *dpacl,
+				      struct posix_acl *pacl)
+{
+	return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
 #endif
 #endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bdc64d1c22fb..68f02973c338 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -662,19 +662,21 @@ error:
  */
 
 static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		struct nameidata *nd)
 {
 	int err = 0;
 	char *name = NULL;
 	gid_t gid;
 	int flags;
+	mode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL;
 	struct p9_fid *dfid, *ofid;
 	struct file *filp;
 	struct p9_qid qid;
 	struct inode *inode;
+	struct posix_acl *pacl = NULL, *dacl = NULL;
 
 	v9ses = v9fs_inode2v9ses(dir);
 	if (nd && nd->flags & LOOKUP_OPEN)
@@ -684,7 +686,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
 
 	name = (char *) dentry->d_name.name;
 	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-			"mode:0x%x\n", name, flags, mode);
+			"mode:0x%x\n", name, flags, omode);
 
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
@@ -702,6 +704,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
 	}
 
 	gid = v9fs_get_fsgid_for_create(dir);
+
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in creat %d\n", err);
+		goto error;
+	}
 	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
 	if (err < 0) {
 		P9_DPRINTK(P9_DEBUG_VFS,
@@ -709,42 +720,48 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
 				err);
 		goto error;
 	}
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
+	    (nd && nd->flags & LOOKUP_OPEN)) {
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+				err);
+			fid = NULL;
+			goto error;
+		}
 
-	/* No need to populate the inode if we are not opening the file AND
-	 * not in cached mode.
-	 */
-	if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
-		/* Not in cached mode. No need to populate inode with stat */
-		dentry->d_op = &v9fs_dentry_operations;
-		p9_client_clunk(ofid);
-		d_instantiate(dentry, NULL);
-		return 0;
-	}
-
-	/* Now walk from the parent so we can get an unopened fid. */
-	fid = p9_client_walk(dfid, 1, &name, 1);
-	if (IS_ERR(fid)) {
-		err = PTR_ERR(fid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		fid = NULL;
-		goto error;
-	}
-
-	/* instantiate inode and assign the unopened fid to dentry */
-	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
-		goto error;
-	}
-	if (v9ses->cache)
+		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+				err);
+			goto error;
+		}
 		dentry->d_op = &v9fs_cached_dentry_operations;
-	else
+		d_instantiate(dentry, inode);
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		/* The fid would get clunked via a dput */
+		fid = NULL;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate
+		 * inode with stat. We need to get an inode
+		 * so that we can set the acl with dentry
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
 		dentry->d_op = &v9fs_dentry_operations;
-	d_instantiate(dentry, inode);
-	err = v9fs_fid_add(dentry, fid);
-	if (err < 0)
-		goto error;
+		d_instantiate(dentry, inode);
+	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
@@ -866,25 +883,28 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  *
  */
 
-static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
-					int mode)
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+			       struct dentry *dentry, int omode)
 {
 	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	gid_t gid;
 	char *name;
+	mode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 	err = 0;
 	v9ses = v9fs_inode2v9ses(dir);
 
-	mode |= S_IFDIR;
+	omode |= S_IFDIR;
 	if (dir->i_mode & S_ISGID)
-		mode |= S_ISGID;
+		omode |= S_ISGID;
+
 	dir_dentry = v9fs_dentry_from_dir_inode(dir);
 	dfid = v9fs_fid_lookup(dir_dentry);
 	if (IS_ERR(dfid)) {
@@ -895,7 +915,14 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	gid = v9fs_get_fsgid_for_create(dir);
-
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in mkdir %d\n", err);
+		goto error;
+	}
 	name = (char *) dentry->d_name.name;
 	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
 	if (err < 0)
@@ -925,7 +952,23 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
 		if (err < 0)
 			goto error;
 		fid = NULL;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate
+		 * inode with stat. We need to get an inode
+		 * so that we can set the acl with dentry
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		dentry->d_op = &v9fs_dentry_operations;
+		d_instantiate(dentry, inode);
 	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
+
 error:
 	if (fid)
 		p9_client_clunk(fid);
@@ -1861,21 +1904,23 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
  *
  */
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		dev_t rdev)
 {
 	int err;
 	char *name;
+	mode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
 	gid_t gid;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	P9_DPRINTK(P9_DEBUG_VFS,
 		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -1891,7 +1936,14 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
 	}
 
 	gid = v9fs_get_fsgid_for_create(dir);
-
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in mknod %d\n", err);
+		goto error;
+	}
 	name = (char *) dentry->d_name.name;
 
 	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
@@ -1935,7 +1987,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
 		dentry->d_op = &v9fs_dentry_operations;
 		d_instantiate(dentry, inode);
 	}
-
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
 error:
 	if (fid)
 		p9_client_clunk(fid);
-- 
cgit v1.2.3


From 76381a42e4a5606774fd48413e6282cd7130ff2c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 28 Sep 2010 00:27:41 +0530
Subject: fs/9p: Add access = client option to opt in acl evaluation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 Documentation/filesystems/9p.txt |  4 ++-
 fs/9p/acl.c                      | 78 +++++++++++++++++++++++++++++++++++++++-
 fs/9p/fid.c                      |  1 +
 fs/9p/v9fs.c                     | 22 +++++++++++-
 fs/9p/v9fs.h                     |  8 +++--
 fs/9p/vfs_super.c                |  4 +--
 6 files changed, 110 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index f9765e8cf086..b22abba78fed 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -111,7 +111,7 @@ OPTIONS
   		This can be used to share devices/named pipes/sockets between
 		hosts.  This functionality will be expanded in later versions.
 
-  access	there are three access modes.
+  access	there are four access modes.
 			user  = if a user tries to access a file on v9fs
 			        filesystem for the first time, v9fs sends an
 			        attach command (Tattach) for that user.
@@ -120,6 +120,8 @@ OPTIONS
 				the files on the mounted filesystem
 			any   = v9fs does single attach and performs all
 				operations as one user
+			client = ACL based access check on the 9p client
+			         side for access validation
 
   cachetag	cache tag to use the specified persistent cache.
 		cache tags for existing cache sessions can be listed at
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 8b3c54a4958c..12d602351dbe 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -22,6 +22,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "v9fs_vfs.h"
+#include "v9fs.h"
 
 static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
@@ -55,7 +56,14 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 {
 	int retval = 0;
 	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
 
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
 	/* get the default/access acl values and cache them */
 	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
 	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
@@ -85,7 +93,18 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
 
 int v9fs_check_acl(struct inode *inode, int mask)
 {
-	struct posix_acl *acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		/*
+		 * On access = client mode get the acl
+		 * values from the server
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
 
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
@@ -204,15 +223,41 @@ cleanup:
 
 }
 
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
 static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
 			      void *buffer, size_t size, int type)
 {
+	struct v9fs_session_info *v9ses;
 	struct posix_acl *acl;
 	int error;
 
 	if (strcmp(name, "") != 0)
 		return -EINVAL;
 
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
 	acl = v9fs_get_cached_acl(dentry->d_inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
@@ -224,16 +269,47 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
 	return error;
 }
 
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+
 static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 			      const void *value, size_t size,
 			      int flags, int type)
 {
 	int retval;
 	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
 	struct inode *inode = dentry->d_inode;
 
 	if (strcmp(name, "") != 0)
 		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * set the attribute on the remote. Without even looking at the
+	 * xattr value. We leave it to the server to validate
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 	if (!is_owner_or_cap(inode))
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
 	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
 		uid = current_fsuid();
 		any = 0;
 		break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->flags |= V9FS_ACCESS_USER;
 			else if (strcmp(s, "any") == 0)
 				v9ses->flags |= V9FS_ACCESS_ANY;
-			else {
+			else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+				v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					"access=client option not supported\n");
+				kfree(s);
+				ret = -EINVAL;
+				goto free_and_return;
+#endif
+			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..8bb7792afe2e 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
  *
  * Session flags reflect options selected by users at mount time
  */
+#define	V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_ANY		= 0x0C,
-	V9FS_ACCESS_MASK	= 0x0C,
+	V9FS_ACCESS_CLIENT	= 0x10
 };
 
 /* possible values of ->cache */
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 14da5778d44e..174643f4f901 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -90,7 +90,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	    MS_NOATIME;
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
-	sb->s_flags |= MS_POSIXACL;
+	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+		sb->s_flags |= MS_POSIXACL;
 #endif
 
 	save_mount_options(sb, data);
@@ -181,7 +182,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
 		goto release_sb;
-
 	v9fs_fid_add(root, fid);
 
 	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
-- 
cgit v1.2.3


From b04faaf3717307cd976a15667c8c24161c1d24ef Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Wed, 22 Sep 2010 16:30:52 -0700
Subject: [fs/9p] Add file_operations for cached mode in dotl protocol.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index f455c45a8c5f..28db7fb1d96e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,6 +44,7 @@
 #include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -92,6 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if(file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+		else if (file->f_op == &v9fs_file_operations_dotl)
+			file->f_op = &v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
 		v9fs_cache_inode_set_cookie(inode, file);
@@ -299,6 +302,18 @@ static const struct file_operations v9fs_cached_file_operations = {
 	.fsync = v9fs_file_fsync,
 };
 
+static const struct file_operations v9fs_cached_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = do_sync_read,
+	.aio_read = generic_file_aio_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
 const struct file_operations v9fs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = v9fs_file_read,
-- 
cgit v1.2.3


From 920e65dc6911da28a58e17f4b683302636fc6d8e Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Wed, 22 Sep 2010 17:19:19 -0700
Subject: [9p] Introduce client side TFSYNC/RFSYNC for dotl.

SYNOPSIS
    size[4] Tfsync tag[2] fid[4]

    size[4] Rfsync tag[2]

DESCRIPTION

The Tfsync transaction transfers ("flushes") all modified in-core data of
file identified by fid to the disk device (or other  permanent  storage
device)  where that  file  resides.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c        | 18 ++++++++++++++++--
 include/net/9p/9p.h     |  2 ++
 include/net/9p/client.h |  1 +
 net/9p/client.c         | 25 +++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 28db7fb1d96e..fdf303207c72 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -290,6 +290,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
 	return retval;
 }
 
+static int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+{
+	struct p9_fid *fid;
+	int retval;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
+			filp, datasync);
+
+	fid = filp->private_data;
+
+	retval = p9_client_fsync(fid);
+	return retval;
+}
+
 static const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
@@ -311,7 +325,7 @@ static const struct file_operations v9fs_cached_file_operations_dotl = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
 	.mmap = generic_file_readonly_mmap,
-	.fsync = v9fs_file_fsync,
+	.fsync = v9fs_file_fsync_dotl,
 };
 
 const struct file_operations v9fs_file_operations = {
@@ -333,5 +347,5 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
 	.mmap = generic_file_readonly_mmap,
-	.fsync = v9fs_file_fsync,
+	.fsync = v9fs_file_fsync_dotl,
 };
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index a4a1b043a8c4..55e96057f47f 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -163,6 +163,8 @@ enum p9_msg_t {
 	P9_RXATTRCREATE,
 	P9_TREADDIR = 40,
 	P9_RREADDIR,
+	P9_TFSYNC = 50,
+	P9_RFSYNC,
 	P9_TLINK = 70,
 	P9_RLINK,
 	P9_TMKDIR = 72,
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 7f63d5ab7b44..8744e3ad4a07 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -229,6 +229,7 @@ int p9_client_symlink(struct p9_fid *fid, char *name, char *symname, gid_t gid,
 int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
 		gid_t gid, struct p9_qid *qid);
 int p9_client_clunk(struct p9_fid *fid);
+int p9_client_fsync(struct p9_fid *fid);
 int p9_client_remove(struct p9_fid *fid);
 int p9_client_read(struct p9_fid *fid, char *data, char __user *udata,
 							u64 offset, u32 count);
diff --git a/net/9p/client.c b/net/9p/client.c
index e50ec802937a..30c4a1b224fb 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1162,6 +1162,31 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
 }
 EXPORT_SYMBOL(p9_client_link);
 
+int p9_client_fsync(struct p9_fid *fid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d\n", fid->fid);
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TFSYNC, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
 int p9_client_clunk(struct p9_fid *fid)
 {
 	int err;
-- 
cgit v1.2.3


From a099027c779068b834f335cfdc3f2bf10f531dd9 Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar" <mohan@in.ibm.com>
Date: Mon, 27 Sep 2010 11:34:24 +0530
Subject: 9p: Implement TLOCK

Synopsis

    size[4] TLock tag[2] fid[4] flock[n]
    size[4] RLock tag[2] status[1]

Description

Tlock is used to acquire/release byte range posix locks on a file
identified by given fid. The reply contains status of the lock request

    flock structure:
        type[1] - Type of lock: F_RDLCK, F_WRLCK, F_UNLCK
        flags[4] - Flags could be either of
          P9_LOCK_FLAGS_BLOCK - Blocked lock request, if there is a
            conflicting lock exists, wait for that lock to be released.
          P9_LOCK_FLAGS_RECLAIM - Reclaim lock request, used when client is
            trying to reclaim a lock after a server restrart (due to crash)
        start[8] - Starting offset for lock
        length[8] - Number of bytes to lock
          If length is 0, lock all bytes starting at the location 'start'
          through to the end of file
        pid[4] - PID of the process that wants to take lock
        client_id[4] - Unique client id

        status[1] - Status of the lock request, can be
          P9_LOCK_SUCCESS(0), P9_LOCK_BLOCKED(1), P9_LOCK_ERROR(2) or
          P9_LOCK_GRACE(3)
          P9_LOCK_SUCCESS - Request was successful
          P9_LOCK_BLOCKED - A conflicting lock is held by another process
          P9_LOCK_ERROR - Error while processing the lock request
          P9_LOCK_GRACE - Server is in grace period, it can't accept new lock
            requests in this period (except locks with
            P9_LOCK_FLAGS_RECLAIM flag set)

Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs_vfs.h        |   2 +
 fs/9p/vfs_file.c        | 160 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/net/9p/9p.h     |  28 +++++++++
 include/net/9p/client.h |   1 +
 net/9p/client.c         |  33 ++++++++++
 5 files changed, 222 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 09861295eef9..d26db1932f7a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -65,3 +65,5 @@ int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
 int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fdf303207c72..6f77abd23184 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -133,6 +134,159 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	return res;
 }
 
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct p9_flock flock;
+	struct p9_fid *fid;
+	uint8_t status;
+	int res = 0;
+	unsigned char fl_type;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+		BUG();
+
+	res = posix_lock_file_wait(filp, fl);
+	if (res < 0)
+		goto out;
+
+	/* convert posix lock to p9 tlock args */
+	memset(&flock, 0, sizeof(flock));
+	flock.type = fl->fl_type;
+	flock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		flock.length = 0;
+	else
+		flock.length = fl->fl_end - fl->fl_start + 1;
+	flock.proc_id = fl->fl_pid;
+	flock.client_id = utsname()->nodename;
+	if (IS_SETLKW(cmd))
+		flock.flags = P9_LOCK_FLAGS_BLOCK;
+
+	/*
+	 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+	 * for lock request, keep on trying
+	 */
+	for (;;) {
+		res = p9_client_lock_dotl(fid, &flock, &status);
+		if (res < 0)
+			break;
+
+		if (status != P9_LOCK_BLOCKED)
+			break;
+		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
+			break;
+		schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+	}
+
+	/* map 9p status to VFS status */
+	switch (status) {
+	case P9_LOCK_SUCCESS:
+		res = 0;
+		break;
+	case P9_LOCK_BLOCKED:
+		res = -EAGAIN;
+		break;
+	case P9_LOCK_ERROR:
+	case P9_LOCK_GRACE:
+		res = -ENOLCK;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * incase server returned error for lock request, revert
+	 * it locally
+	 */
+	if (res < 0 && fl->fl_type != F_UNLCK) {
+		fl_type = fl->fl_type;
+		fl->fl_type = F_UNLCK;
+		res = posix_lock_file_wait(filp, fl);
+		fl->fl_type = fl_type;
+	}
+out:
+	return res;
+}
+
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret = -ENOLCK;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+				cmd, fl, filp->f_path.dentry->d_name.name);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+
+	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+	struct file_lock *fl)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret = -ENOLCK;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+				cmd, fl, filp->f_path.dentry->d_name.name);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+	/* Convert flock to posix lock */
+	fl->fl_owner = (fl_owner_t)filp;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_flags |= FL_POSIX;
+	fl->fl_flags ^= FL_FLOCK;
+
+	if (IS_SETLK(cmd) | IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
 /**
  * v9fs_file_readn - read from a file
  * @filp: file pointer to read
@@ -323,7 +477,8 @@ static const struct file_operations v9fs_cached_file_operations_dotl = {
 	.write = v9fs_file_write,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync_dotl,
 };
@@ -345,7 +500,8 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.write = v9fs_file_write,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync_dotl,
 };
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 55e96057f47f..1859a2560cc5 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -165,6 +165,8 @@ enum p9_msg_t {
 	P9_RREADDIR,
 	P9_TFSYNC = 50,
 	P9_RFSYNC,
+	P9_TLOCK = 52,
+	P9_RLOCK,
 	P9_TLINK = 70,
 	P9_RLINK,
 	P9_TMKDIR = 72,
@@ -464,6 +466,32 @@ struct p9_iattr_dotl {
 	u64 mtime_nsec;
 };
 
+#define P9_LOCK_SUCCESS 0
+#define P9_LOCK_BLOCKED 1
+#define P9_LOCK_ERROR 2
+#define P9_LOCK_GRACE 3
+
+#define P9_LOCK_FLAGS_BLOCK 1
+#define P9_LOCK_FLAGS_RECLAIM 2
+
+/* struct p9_flock: POSIX lock structure
+ * @type - type of lock
+ * @flags - lock flags
+ * @start - starting offset of the lock
+ * @length - number of bytes
+ * @proc_id - process id which wants to take lock
+ * @client_id - client id
+ */
+
+struct p9_flock {
+	u8 type;
+	u32 flags;
+	u64 start;
+	u64 length;
+	u32 proc_id;
+	char *client_id;
+};
+
 /* Structures for Protocol Operations */
 struct p9_tstatfs {
 	u32 fid;
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 8744e3ad4a07..d7dcb142e3bb 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -249,6 +249,7 @@ int p9_client_mknod_dotl(struct p9_fid *oldfid, char *name, int mode,
 			dev_t rdev, gid_t gid, struct p9_qid *);
 int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
 				gid_t gid, struct p9_qid *);
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
 struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
 void p9_client_cb(struct p9_client *c, struct p9_req_t *req);
 
diff --git a/net/9p/client.c b/net/9p/client.c
index 30c4a1b224fb..fbd2b195801c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1803,3 +1803,36 @@ error:
 
 }
 EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
+			"start %lld length %lld proc_id %d client_id %s\n",
+			fid->fid, flock->type, flock->flags, flock->start,
+			flock->length, flock->proc_id, flock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+				flock->flags, flock->start, flock->length,
+					flock->proc_id, flock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+error:
+	p9_free_req(clnt, req);
+	return err;
+
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
-- 
cgit v1.2.3


From 1d769cd192fc8c4097b1e2cd41fdee6ba3d1b2af Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar" <mohan@in.ibm.com>
Date: Mon, 27 Sep 2010 12:22:13 +0530
Subject: 9p: Implement TGETLOCK

Synopsis

    size[4] TGetlock tag[2] fid[4] getlock[n]
    size[4] RGetlock tag[2] getlock[n]

Description

TGetlock is used to test for the existence of byte range posix locks on a file
identified by given fid. The reply contains getlock structure. If the lock could
be placed it returns F_UNLCK in type field of getlock structure.  Otherwise it
returns the details of the conflicting locks in the getlock structure

    getlock structure:
      type[1] - Type of lock: F_RDLCK, F_WRLCK
      start[8] - Starting offset for lock
      length[8] - Number of bytes to check for the lock
             If length is 0, check for lock in all bytes starting at the location
            'start' through to the end of file
      pid[4] - PID of the process that wants to take lock/owns the task
               in case of reply
      client[4] - Client id of the system that owns the process which
                  has the conflicting lock

Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c        | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/net/9p/9p.h     | 18 ++++++++++++++++++
 include/net/9p/client.h |  1 +
 net/9p/client.c         | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6f77abd23184..3a4352f23a98 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -211,6 +211,51 @@ out:
 	return res;
 }
 
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+	struct p9_getlock glock;
+	struct p9_fid *fid;
+	int res = 0;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	posix_test_lock(filp, fl);
+	/*
+	 * if we have a conflicting lock locally, no need to validate
+	 * with server
+	 */
+	if (fl->fl_type != F_UNLCK)
+		return res;
+
+	/* convert posix lock to p9 tgetlock args */
+	memset(&glock, 0, sizeof(glock));
+	glock.type = fl->fl_type;
+	glock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		glock.length = 0;
+	else
+		glock.length = fl->fl_end - fl->fl_start + 1;
+	glock.proc_id = fl->fl_pid;
+	glock.client_id = utsname()->nodename;
+
+	res = p9_client_getlock_dotl(fid, &glock);
+	if (res < 0)
+		return res;
+	if (glock.type != F_UNLCK) {
+		fl->fl_type = glock.type;
+		fl->fl_start = glock.start;
+		if (glock.length == 0)
+			fl->fl_end = OFFSET_MAX;
+		else
+			fl->fl_end = glock.start + glock.length - 1;
+		fl->fl_pid = glock.proc_id;
+	} else
+		fl->fl_type = F_UNLCK;
+
+	return res;
+}
+
 /**
  * v9fs_file_lock_dotl - lock a file (or directory)
  * @filp: file to be locked
@@ -238,6 +283,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
 		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else if (IS_GETLK(cmd))
+		ret = v9fs_file_getlock(filp, fl);
 	else
 		ret = -EINVAL;
 out_err:
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 1859a2560cc5..6367a71d84fc 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -167,6 +167,8 @@ enum p9_msg_t {
 	P9_RFSYNC,
 	P9_TLOCK = 52,
 	P9_RLOCK,
+	P9_TGETLOCK = 54,
+	P9_RGETLOCK,
 	P9_TLINK = 70,
 	P9_RLINK,
 	P9_TMKDIR = 72,
@@ -492,6 +494,22 @@ struct p9_flock {
 	char *client_id;
 };
 
+/* struct p9_getlock: getlock structure
+ * @type - type of lock
+ * @start - starting offset of the lock
+ * @length - number of bytes
+ * @proc_id - process id which wants to take lock
+ * @client_id - client id
+ */
+
+struct p9_getlock {
+	u8 type;
+	u64 start;
+	u64 length;
+	u32 proc_id;
+	char *client_id;
+};
+
 /* Structures for Protocol Operations */
 struct p9_tstatfs {
 	u32 fid;
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index d7dcb142e3bb..127c9f2a9cb8 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -250,6 +250,7 @@ int p9_client_mknod_dotl(struct p9_fid *oldfid, char *name, int mode,
 int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
 				gid_t gid, struct p9_qid *);
 int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
 struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
 void p9_client_cb(struct p9_client *c, struct p9_req_t *req);
 
diff --git a/net/9p/client.c b/net/9p/client.c
index fbd2b195801c..fc1b0579016a 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1836,3 +1836,37 @@ error:
 
 }
 EXPORT_SYMBOL(p9_client_lock_dotl);
+
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
+		"length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,  glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
+			&glock->start, &glock->length, &glock->proc_id,
+			&glock->client_id);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
+		"proc_id %d client_id %s\n", glock->type, glock->start,
+		glock->length, glock->proc_id, glock->client_id);
+error:
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
-- 
cgit v1.2.3


From 368c09d2a303c39e9f37193b23e945e6754cf0a7 Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar" <mohan@in.ibm.com>
Date: Mon, 27 Sep 2010 14:17:24 +0530
Subject: 9p: Use V9FS_MAGIC in statfs

Use V9FS_MAGIC as the file system type while filling kernel statfs
strucutre instead of using host file system magic number. Also move
the definition of V9FS_MAGIC from v9fs.h to standard magic.h file.

Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h          | 2 --
 fs/9p/vfs_super.c     | 3 ++-
 include/linux/magic.h | 1 +
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 8bb7792afe2e..cb6396855e2d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -117,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
-#define V9FS_MAGIC 0x01021997
-
 /* other default globals */
 #define V9FS_PORT	564
 #define V9FS_DEFUSER	"nobody"
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 174643f4f901..48d4215c60a8 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/magic.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -256,7 +257,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (v9fs_proto_dotl(v9ses)) {
 		res = p9_client_statfs(fid, &rs);
 		if (res == 0) {
-			buf->f_type = rs.type;
+			buf->f_type = V9FS_MAGIC;
 			buf->f_bsize = rs.bsize;
 			buf->f_blocks = rs.blocks;
 			buf->f_bfree = rs.bfree;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index eb9800f05782..ff690d05f129 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -57,5 +57,6 @@
 
 #define DEVPTS_SUPER_MAGIC	0x1cd1
 #define SOCKFS_MAGIC		0x534F434B
+#define V9FS_MAGIC		0x01021997
 
 #endif /* __LINUX_MAGIC_H__ */
-- 
cgit v1.2.3


From 329176cc2c50e63c580ddaabb385876db5af1360 Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar" <mohan@in.ibm.com>
Date: Tue, 28 Sep 2010 19:59:25 +0530
Subject: 9p: Implement TREADLINK operation for 9p2000.L

Synopsis

	size[4] TReadlink tag[2] fid[4]
	size[4] RReadlink tag[2] target[s]

Description
	Readlink is used to return the contents of the symoblic link
        referred by fid. Contents of symboic link is returned as a
        response.

	target[s] - Contents of the symbolic link referred by fid.

Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c       | 60 ++++++++++++++++++++++++++++++++++++++++++++++---
 include/net/9p/9p.h     |  2 ++
 include/net/9p/client.h |  1 +
 net/9p/client.c         | 26 +++++++++++++++++++++
 4 files changed, 86 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 68f02973c338..88419073c654 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1527,7 +1527,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
-	if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
+	if (!v9fs_proto_dotu(v9ses))
 		return -EBADF;
 
 	st = p9_client_stat(fid);
@@ -1995,6 +1995,60 @@ error:
 	return err;
 }
 
+static int
+v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
+{
+	int retval;
+	struct p9_fid *fid;
+	char *target = NULL;
+
+	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+	retval = -EPERM;
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	retval = p9_client_readlink(fid, &target);
+	if (retval < 0)
+		return retval;
+
+	strncpy(buffer, target, buflen);
+	P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
+
+	retval = strnlen(buffer, buflen);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+	int len = 0;
+	char *link = __getname();
+
+	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+
+	if (!link)
+		link = ERR_PTR(-ENOMEM);
+	else {
+		len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
+		if (len < 0) {
+			__putname(link);
+			link = ERR_PTR(len);
+		} else
+			link[min(len, PATH_MAX-1)] = 0;
+	}
+	nd_set_link(nd, link);
+
+	return NULL;
+}
+
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2064,8 +2118,8 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = generic_readlink,
-	.follow_link = v9fs_vfs_follow_link,
+	.readlink = v9fs_vfs_readlink_dotl,
+	.follow_link = v9fs_vfs_follow_link_dotl,
 	.put_link = v9fs_vfs_put_link,
 	.getattr = v9fs_vfs_getattr_dotl,
 	.setattr = v9fs_vfs_setattr_dotl,
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 6367a71d84fc..071fd7a8d781 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -153,6 +153,8 @@ enum p9_msg_t {
 	P9_RMKNOD,
 	P9_TRENAME = 20,
 	P9_RRENAME,
+	P9_TREADLINK = 22,
+	P9_RREADLINK,
 	P9_TGETATTR = 24,
 	P9_RGETATTR,
 	P9_TSETATTR = 26,
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 127c9f2a9cb8..335b088e7672 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -262,5 +262,6 @@ int p9_is_proto_dotu(struct p9_client *clnt);
 int p9_is_proto_dotl(struct p9_client *clnt);
 struct p9_fid *p9_client_xattrwalk(struct p9_fid *, const char *, u64 *);
 int p9_client_xattrcreate(struct p9_fid *, const char *, u64, int);
+int p9_client_readlink(struct p9_fid *fid, char **target);
 
 #endif /* NET_9P_CLIENT_H */
diff --git a/net/9p/client.c b/net/9p/client.c
index fc1b0579016a..2bc99e9031e7 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1870,3 +1870,29 @@ error:
 	return err;
 }
 EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+error:
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
-- 
cgit v1.2.3


From f5fc6145f385a6287d3d63ee5cf3499ef038c699 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 12 Oct 2010 13:02:25 +0530
Subject: fs/9p: Use mknod 9p operation on create without open request

A create without LOOKUP_OPEN flag set is due to mknod of regular
files. Use mknod 9P operation for the same

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 88419073c654..8f68280d752e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -55,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
 static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+		    dev_t rdev);
+
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
  * @v9ses: v9fs session information
@@ -681,8 +685,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	v9ses = v9fs_inode2v9ses(dir);
 	if (nd && nd->flags & LOOKUP_OPEN)
 		flags = nd->intent.open.flags - 1;
-	else
-		flags = O_RDWR;
+	else {
+		/*
+		 * create call without LOOKUP_OPEN is due
+		 * to mknod of regular files. So use mknod
+		 * operation.
+		 */
+		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+	}
 
 	name = (char *) dentry->d_name.name;
 	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-- 
cgit v1.2.3


From 9856af8b535aaf51d95dab2087e79508f551fbb8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Sep 2010 12:24:23 +0530
Subject: fs/9p: Add missing iput in v9fs_vfs_lookup

Make sure we drop inode reference in the error path

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8f68280d752e..a69986c80328 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1037,7 +1037,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
-		goto error;
+		goto error_iput;
 
 inst_out:
 	if (v9ses->cache)
@@ -1048,6 +1048,8 @@ inst_out:
 	d_add(dentry, inode);
 	return NULL;
 
+error_iput:
+	iput(inode);
 error:
 	p9_client_clunk(fid);
 
-- 
cgit v1.2.3


From 877cb3d4dd73838adcc6b79f2a3d29b155e7ebbe Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 22 Sep 2010 12:42:30 +0530
Subject: fs/9p: Use generic_file_open with lookup_instantiate_filp

We need to do O_LARGEFILE check even in case of 9p. Use the
generic_file_open helper

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a69986c80328..34bf71b56542 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -564,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	return retval;
 }
 
-static int
-v9fs_open_created(struct inode *inode, struct file *file)
-{
-	return 0;
-}
-
-
 /**
  * v9fs_create - Create a file
  * @v9ses: session information
@@ -775,7 +768,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
 			p9_client_clunk(ofid);
 			return PTR_ERR(filp);
@@ -834,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
 			goto error;
-- 
cgit v1.2.3


From b165d60145b717261a0234f989c442c2b68b6ec0 Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Fri, 22 Oct 2010 10:13:12 -0700
Subject: 9p: Add datasync to client side TFSYNC/RFSYNC for dotl

SYNOPSIS
    size[4] Tfsync tag[2] fid[4] datasync[4]

    size[4] Rfsync tag[2]

DESCRIPTION

    The Tfsync transaction transfers ("flushes") all modified in-core data of
    file identified by fid to the disk device (or other  permanent  storage
    device)  where that  file  resides.

    If datasync flag is specified data will be fleshed but does not flush
    modified metadata unless  that  metadata  is  needed  in order to allow a
    subsequent data retrieval to be correctly handled.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs_vfs.h        | 1 +
 fs/9p/vfs_dir.c         | 1 +
 fs/9p/vfs_file.c        | 4 ++--
 include/net/9p/client.h | 2 +-
 net/9p/client.c         | 7 ++++---
 5 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d26db1932f7a..bab0eac873f4 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -65,5 +65,6 @@ int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
 int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
 
 #define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 170f5bb8ebe0..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -315,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
 	.readdir = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3a4352f23a98..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -491,7 +491,7 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
 	return retval;
 }
 
-static int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+int v9fs_file_fsync_dotl(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	int retval;
@@ -501,7 +501,7 @@ static int v9fs_file_fsync_dotl(struct file *filp, int datasync)
 
 	fid = filp->private_data;
 
-	retval = p9_client_fsync(fid);
+	retval = p9_client_fsync(fid, datasync);
 	return retval;
 }
 
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 335b088e7672..83ba6a4d58a3 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -229,7 +229,7 @@ int p9_client_symlink(struct p9_fid *fid, char *name, char *symname, gid_t gid,
 int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
 		gid_t gid, struct p9_qid *qid);
 int p9_client_clunk(struct p9_fid *fid);
-int p9_client_fsync(struct p9_fid *fid);
+int p9_client_fsync(struct p9_fid *fid, int datasync);
 int p9_client_remove(struct p9_fid *fid);
 int p9_client_read(struct p9_fid *fid, char *data, char __user *udata,
 							u64 offset, u32 count);
diff --git a/net/9p/client.c b/net/9p/client.c
index e3cfdff37327..8df80fb86f23 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1165,17 +1165,18 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
 }
 EXPORT_SYMBOL(p9_client_link);
 
-int p9_client_fsync(struct p9_fid *fid)
+int p9_client_fsync(struct p9_fid *fid, int datasync)
 {
 	int err;
 	struct p9_client *clnt;
 	struct p9_req_t *req;
 
-	P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d\n", fid->fid);
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+			fid->fid, datasync);
 	err = 0;
 	clnt = fid->clnt;
 
-	req = p9_client_rpc(clnt, P9_TFSYNC, "d", fid->fid);
+	req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto error;
-- 
cgit v1.2.3


From 568a810d7edd58bd505222dd1c7e48895532290b Mon Sep 17 00:00:00 2001
From: Steve Dickson <steved@redhat.com>
Date: Thu, 28 Oct 2010 08:17:54 -0400
Subject: Fixed Regression in NFS Direct I/O path

A typo, introduced by commit f11ac8db, in the nfs_direct_write()
routine causes writes with O_DIRECT set to fail with a ENOMEM error.

Found-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve Dickson <steved@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..84d3c8b90206 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
-	if (dreq->l_ctx != NULL)
+	if (dreq->l_ctx == NULL)
 		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
-- 
cgit v1.2.3


From 015f0212d51d85bd281a831639a769b4a1a3307a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 28 Oct 2010 10:10:37 -0400
Subject: nfs: handle lock context allocation failures in nfs_create_request

nfs_get_lock_context can return NULL on an allocation failure.
Regression introduced by commit f11ac8db.

Reported-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..137b549e63db 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	if (req == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	/* get lock context early so we can deal with alloc failures */
+	req->wb_lock_context = nfs_get_lock_context(ctx);
+	if (req->wb_lock_context == NULL) {
+		nfs_page_free(req);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
 	 * update_nfs_request below if the region is not locked. */
@@ -79,7 +86,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
 	req->wb_context = get_nfs_open_context(ctx);
-	req->wb_lock_context = nfs_get_lock_context(ctx);
 	kref_init(&req->wb_kref);
 	return req;
 }
-- 
cgit v1.2.3


From 8f0d97b41523fb85a2d230f6794121e5834f0cf9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 28 Oct 2010 08:05:57 +0200
Subject: nfs: testing the wrong variable

The intent was to test "*desc" for allocation failures, but it tests
"desc" which is always a valid pointer here.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index dec47ed8b6b9..4e2d9b6b1380 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -123,7 +123,7 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
 	size_t desclen = typelen + namelen + 2;
 
 	*desc = kmalloc(desclen, GFP_KERNEL);
-	if (!desc)
+	if (!*desc)
 		return -ENOMEM;
 
 	cp = *desc;
-- 
cgit v1.2.3


From 19ef20143ff86c8012270c619ac7b6c3b389a8fa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 Oct 2010 09:56:31 +0200
Subject: ext4: fix compile with CONFIG_EXT4_FS_XATTR disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 5dabfc78dced ("ext4: rename {exit,init}_ext4_*() to
ext4_{exit,init}_*()") causes

  fs/ext4/super.c:4776: error: implicit declaration of function ‘ext4_init_xattr’

when CONFIG_EXT4_FS_XATTR is disabled.

It renamed init_ext4_xattr to ext4_init_xattr but forgot to update the
dummy definition in fs/ext4/xattr.h.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/xattr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 281dd8353652..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -122,7 +122,7 @@ ext4_xattr_put_super(struct super_block *sb)
 }
 
 static __init inline int
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 07724586b44e69a08af3102be3f9afd350eda1e3 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Sun, 24 Oct 2010 23:13:57 +0100
Subject: Squashfs: fix use of __le64 annotated variable

This fixes a sparse with endian checking warning.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/xattr.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
 					strncmp(target, name, name_size) == 0) {
 			/* found xattr */
 			if (type & SQUASHFS_XATTR_VALUE_OOL) {
-				__le64 xattr;
+				__le64 xattr_val;
+				u64 xattr;
 				/* val is a reference to the real location */
 				err = squashfs_read_metadata(sb, &val, &start,
 						&offset, sizeof(val));
 				if (err < 0)
 					goto failed;
-				err = squashfs_read_metadata(sb, &xattr, &start,
-					 &offset, sizeof(xattr));
+				err = squashfs_read_metadata(sb, &xattr_val,
+					&start, &offset, sizeof(xattr_val));
 				if (err < 0)
 					goto failed;
-				xattr = le64_to_cpu(xattr);
+				xattr = le64_to_cpu(xattr_val);
 				start = SQUASHFS_XATTR_BLK(xattr) +
 							msblk->xattr_table;
 				offset = SQUASHFS_XATTR_OFFSET(xattr);
-- 
cgit v1.2.3


From 5f3b321da1aa5936c85ed6dec67358239a9d3cfd Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 25 Oct 2010 00:17:24 +0100
Subject: Squashfs: fix function prototype

The fourth argument should be unsigned.  Also add missing include
so that the function prototype is defined in xattr_id.c

This fixes a couple of sparse warnings.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/xattr.h    | 4 ++--
 fs/squashfs/xattr_id.c | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
 		u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
-		int *, unsigned long long *);
+		unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 }
 
 static inline int squashfs_xattr_lookup(struct super_block *sb,
-		unsigned int index, int *count, int *size,
+		unsigned int index, int *count, unsigned int *size,
 		unsigned long long *xattr)
 {
 	return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..d33be5dd6c32 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Map xattr id using the xattr id look up table
-- 
cgit v1.2.3


From 12364a4f05295cb1e4a161d36b486c248c11c485 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 28 Oct 2010 20:06:19 +0200
Subject: nfs4: The difference of 2 pointers is ptrdiff_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On m68k, which is 32-bit:

fs/nfs/nfs4proc.c: In function ‘nfs41_sequence_done’:
fs/nfs/nfs4proc.c:432: warning: format ‘%ld’ expects type ‘long int’, but argument 3 has type ‘int’
fs/nfs/nfs4proc.c: In function ‘nfs4_setup_sequence’:
fs/nfs/nfs4proc.c:576: warning: format ‘%ld’ expects type ‘long int’, but argument 5 has type ‘int’

On 32-bit, ptrdiff_t is int; on 64-bit, ptrdiff_t is long.

Introduced by commit dfb4f309830359352539919f23accc59a20a3758 ("NFSv4.1: keep
seq_res.sr_slot as pointer rather than an index")

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 32c8758c99fd..0f24cdf2cb13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -429,7 +429,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
 		 * returned NFS4ERR_DELAY as per Section 2.10.6.2
 		 * of RFC5661.
 		 */
-		dprintk("%s: slot=%ld seq=%d: Operation in progress\n",
+		dprintk("%s: slot=%td seq=%d: Operation in progress\n",
 			__func__,
 			res->sr_slot - res->sr_session->fc_slot_table.slots,
 			res->sr_slot->seq_nr);
@@ -573,7 +573,7 @@ int nfs4_setup_sequence(const struct nfs_server *server,
 		goto out;
 	}
 
-	dprintk("--> %s clp %p session %p sr_slot %ld\n",
+	dprintk("--> %s clp %p session %p sr_slot %td\n",
 		__func__, session->clp, session, res->sr_slot ?
 			res->sr_slot - session->fc_slot_table.slots : -1);
 
-- 
cgit v1.2.3


From 0af3d00bad38d3bb9912a60928ad0669f17bdb76 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 21 Jun 2010 14:48:16 -0400
Subject: Btrfs: create special free space cache inode

In order to save free space cache, we need an inode to hold the data, and we
need a special item to point at the right inode for the right block group.  So
first, create a special item that will point to the right inode, and the number
of extent entries we will have and the number of bitmaps we will have.  We
truncate and pre-allocate space everytime to make sure it's uptodate.

This feature will be turned on as soon as you mount with -o space_cache, however
it is safe to boot into old kernels, they will just generate the cache the old
fashion way.  When you boot back into a newer kernel we will notice that we
modified and not the cache and automatically discard the cache.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |  74 ++++++++++++--
 fs/btrfs/disk-io.c          |   3 +-
 fs/btrfs/extent-tree.c      | 231 ++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/free-space-cache.c | 155 +++++++++++++++++++++++++++++
 fs/btrfs/free-space-cache.h |  11 +++
 fs/btrfs/inode.c            |  95 ++++++++++++++----
 fs/btrfs/relocation.c       |  91 ++++++++++++++++-
 fs/btrfs/super.c            |   7 +-
 fs/btrfs/transaction.c      |  43 ++++++---
 fs/btrfs/transaction.h      |   4 +
 10 files changed, 668 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad17..46f52e1beade 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
 
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -265,6 +268,22 @@ struct btrfs_chunk {
 	/* additional stripes go here */
 } __attribute__ ((__packed__));
 
+#define BTRFS_FREE_SPACE_EXTENT	1
+#define BTRFS_FREE_SPACE_BITMAP	2
+
+struct btrfs_free_space_entry {
+	__le64 offset;
+	__le64 bytes;
+	u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+	struct btrfs_disk_key location;
+	__le64 generation;
+	__le64 num_entries;
+	__le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
 	BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
 
 	char label[BTRFS_LABEL_SIZE];
 
+	__le64 cache_generation;
+
 	/* future expansion */
-	__le64 reserved[32];
+	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -375,12 +396,12 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+#define BTRFS_FEATURE_INCOMPAT_SUPP			\
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
@@ -750,6 +771,14 @@ enum btrfs_caching_type {
 	BTRFS_CACHE_FINISHED	= 2,
 };
 
+enum btrfs_disk_cache_state {
+	BTRFS_DC_WRITTEN	= 0,
+	BTRFS_DC_ERROR		= 1,
+	BTRFS_DC_CLEAR		= 2,
+	BTRFS_DC_SETUP		= 3,
+	BTRFS_DC_NEED_WRITE	= 4,
+};
+
 struct btrfs_caching_control {
 	struct list_head list;
 	struct mutex mutex;
@@ -763,6 +792,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
@@ -773,8 +803,11 @@ struct btrfs_block_group_cache {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int ro;
-	int dirty;
+	int ro:1;
+	int dirty:1;
+	int iref:1;
+
+	int disk_cache_state;
 
 	/* cache tracking stuff */
 	int cached;
@@ -1192,6 +1225,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOSSD		(1 << 9)
 #define BTRFS_MOUNT_DISCARD		(1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1699,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
 	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+		   num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+		   num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+		   generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+					struct btrfs_free_space_header *h,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+					    struct btrfs_free_space_header *h,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
 /* struct btrfs_disk_key */
 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
 			 objectid, 64);
@@ -1876,6 +1931,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
 			 csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+			 cache_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -2115,6 +2172,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2426,6 +2484,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
 			      u64 start, u64 num_bytes, u64 min_size,
 			      loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..45cf64fc1e3e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1685,7 +1685,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
@@ -1993,6 +1992,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
 		btrfs_orphan_cleanup(fs_info->fs_root);
+		btrfs_orphan_cleanup(fs_info->tree_root);
 		up_read(&fs_info->cleanup_work_sem);
 	}
 
@@ -2421,6 +2421,7 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	btrfs_put_block_group_cache(fs_info);
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
 		if (ret)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..aab40fb3faed 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2688,6 +2688,109 @@ next_block_group(struct btrfs_root *root,
 	return cache;
 }
 
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_path *path)
+{
+	struct btrfs_root *root = block_group->fs_info->tree_root;
+	struct inode *inode = NULL;
+	u64 alloc_hint = 0;
+	int num_pages = 0;
+	int retries = 0;
+	int ret = 0;
+
+	/*
+	 * If this block group is smaller than 100 megs don't bother caching the
+	 * block group.
+	 */
+	if (block_group->key.offset < (100 * 1024 * 1024)) {
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+
+again:
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+		ret = PTR_ERR(inode);
+		btrfs_release_path(root, path);
+		goto out;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retries);
+		retries++;
+
+		if (block_group->ro)
+			goto out_free;
+
+		ret = create_free_space_inode(root, trans, block_group, path);
+		if (ret)
+			goto out_free;
+		goto again;
+	}
+
+	/*
+	 * We want to set the generation to 0, that way if anything goes wrong
+	 * from here on out we know not to trust this cache when we load up next
+	 * time.
+	 */
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	WARN_ON(ret);
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_truncate_free_space_cache(root, trans, path,
+						      inode);
+		if (ret)
+			goto out_put;
+	}
+
+	spin_lock(&block_group->lock);
+	if (block_group->cached != BTRFS_CACHE_FINISHED) {
+		spin_unlock(&block_group->lock);
+		goto out_put;
+	}
+	spin_unlock(&block_group->lock);
+
+	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+	if (!num_pages)
+		num_pages = 1;
+
+	/*
+	 * Just to make absolutely sure we have enough space, we're going to
+	 * preallocate 12 pages worth of space for each block group.  In
+	 * practice we ought to use at most 8, but we need extra space so we can
+	 * add our header and have a terminator between the extents and the
+	 * bitmaps.
+	 */
+	num_pages *= 16;
+	num_pages *= PAGE_CACHE_SIZE;
+
+	ret = btrfs_check_data_free_space(inode, num_pages);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+					      num_pages, num_pages,
+					      &alloc_hint);
+	btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+	iput(inode);
+out_free:
+	btrfs_release_path(root, path);
+out:
+	spin_lock(&block_group->lock);
+	if (ret)
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+	else
+		block_group->disk_cache_state = BTRFS_DC_SETUP;
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
@@ -2700,6 +2803,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+again:
+	while (1) {
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+		err = cache_save_setup(cache, trans, path);
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	while (1) {
 		if (last == 0) {
 			err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2831,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+
 			if (cache->dirty)
 				break;
 			cache = next_block_group(root, cache);
@@ -2883,11 +3010,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 used;
-	int ret = 0, committed = 0;
+	int ret = 0, committed = 0, alloc_chunk = 1;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
+	if (root == root->fs_info->tree_root) {
+		alloc_chunk = 0;
+		committed = 1;
+	}
+
 	data_sinfo = BTRFS_I(inode)->space_info;
 	if (!data_sinfo)
 		goto alloc;
@@ -2906,7 +3038,7 @@ again:
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
-		if (!data_sinfo->full) {
+		if (!data_sinfo->full && alloc_chunk) {
 			u64 alloc_target;
 
 			data_sinfo->force_alloc = 1;
@@ -3777,12 +3909,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group_cache *cache = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
-	int factor;
 	u64 total = num_bytes;
 	u64 old_val;
 	u64 byte_in_group;
+	int factor;
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_lock);
@@ -3804,11 +3936,17 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			factor = 2;
 		else
 			factor = 1;
+
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
 		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
+
+		if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+		    cache->disk_cache_state < BTRFS_DC_CLEAR)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
 		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -7814,6 +7952,40 @@ out:
 	return ret;
 }
 
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 last = 0;
+
+	while (1) {
+		struct inode *inode;
+
+		block_group = btrfs_lookup_first_block_group(info, last);
+		while (block_group) {
+			spin_lock(&block_group->lock);
+			if (block_group->iref)
+				break;
+			spin_unlock(&block_group->lock);
+			block_group = next_block_group(info->tree_root,
+						       block_group);
+		}
+		if (!block_group) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		inode = block_group->inode;
+		block_group->iref = 0;
+		block_group->inode = NULL;
+		spin_unlock(&block_group->lock);
+		iput(inode);
+		last = block_group->key.objectid + block_group->key.offset;
+		btrfs_put_block_group(block_group);
+	}
+}
+
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8069,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	int need_clear = 0;
+	u64 cache_gen;
 
 	root = info->extent_root;
 	key.objectid = 0;
@@ -7906,6 +8080,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
+	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	if (cache_gen != 0 &&
+	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+		need_clear = 1;
+
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0)
@@ -7928,6 +8107,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
 
+		if (need_clear)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
 		/*
 		 * we only want to have 32k of ram per block group for keeping
 		 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8214,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.offset = size;
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
+	cache->fs_info = root->fs_info;
 
 	/*
 	 * we only want to have 32k of ram per block group for keeping track
@@ -8088,7 +8271,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_free_cluster *cluster;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_key key;
+	struct inode *inode;
 	int ret;
 
 	root = root->fs_info->extent_root;
@@ -8097,8 +8282,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!block_group);
 	BUG_ON(!block_group->ro);
 
-	memcpy(&key, &block_group->key, sizeof(key));
-
 	/* make sure this block group isn't part of an allocation cluster */
 	cluster = &root->fs_info->data_alloc_cluster;
 	spin_lock(&cluster->refill_lock);
@@ -8117,6 +8300,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (!IS_ERR(inode)) {
+		btrfs_orphan_add(trans, inode);
+		clear_nlink(inode);
+		/* One for the block groups ref */
+		spin_lock(&block_group->lock);
+		if (block_group->iref) {
+			block_group->iref = 0;
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			iput(inode);
+		} else {
+			spin_unlock(&block_group->lock);
+		}
+		/* One for our lookup ref */
+		iput(inode);
+	}
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0)
+		btrfs_release_path(tree_root, path);
+	if (ret == 0) {
+		ret = btrfs_del_item(trans, tree_root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(tree_root, path);
+	}
+
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
@@ -8140,6 +8357,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	spin_unlock(&block_group->space_info->lock);
 
+	memcpy(&key, &block_group->key, sizeof(key));
+
 	btrfs_clear_space_info_full(root->fs_info);
 
 	btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d99..05efcc7061a7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,165 @@
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
+#include "disk-io.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
 
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct inode *inode = NULL;
+	int ret;
+
+	spin_lock(&block_group->lock);
+	if (block_group->inode)
+		inode = igrab(block_group->inode);
+	spin_unlock(&block_group->lock);
+	if (inode)
+		return inode;
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0) {
+		btrfs_release_path(root, path);
+		return ERR_PTR(-ENOENT);
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_free_space_key(leaf, header, &disk_key);
+	btrfs_disk_key_to_cpu(&location, &disk_key);
+	btrfs_release_path(root, path);
+
+	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+	if (IS_ERR(inode))
+		return inode;
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		return ERR_PTR(-ENOENT);
+	}
+
+	spin_lock(&block_group->lock);
+	if (!root->fs_info->closing) {
+		block_group->inode = igrab(inode);
+		block_group->iref = 1;
+	}
+	spin_unlock(&block_group->lock);
+
+	return inode;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	u64 objectid;
+	int ret;
+
+	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		return ret;
+
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+	btrfs_item_key(leaf, &disk_key, path->slots[0]);
+	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+			     sizeof(*inode_item));
+	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+	btrfs_set_inode_size(leaf, inode_item, 0);
+	btrfs_set_inode_nbytes(leaf, inode_item, 0);
+	btrfs_set_inode_uid(leaf, inode_item, 0);
+	btrfs_set_inode_gid(leaf, inode_item, 0);
+	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
+			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+	btrfs_set_inode_nlink(leaf, inode_item, 1);
+	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+	btrfs_set_inode_block_group(leaf, inode_item,
+				    block_group->key.objectid);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(struct btrfs_free_space_header));
+	if (ret < 0) {
+		btrfs_release_path(root, path);
+		return ret;
+	}
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+	btrfs_set_free_space_key(leaf, header, &disk_key);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode)
+{
+	loff_t oldsize;
+	int ret = 0;
+
+	trans->block_rsv = root->orphan_block_rsv;
+	ret = btrfs_block_rsv_check(trans, root,
+				    root->orphan_block_rsv,
+				    0, 5);
+	if (ret)
+		return ret;
+
+	oldsize = i_size_read(inode);
+	btrfs_i_size_write(inode, 0);
+	truncate_pagecache(inode, oldsize, 0);
+
+	/*
+	 * We don't need an orphan item because truncating the free space cache
+	 * will never be split across transactions.
+	 */
+	ret = btrfs_truncate_inode_items(trans, root, inode,
+					 0, BTRFS_EXTENT_DATA_KEY);
+	if (ret) {
+		WARN_ON(1);
+		return ret;
+	}
+
+	return btrfs_update_inode(trans, root, inode);
+}
+
 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
 					  u64 offset)
 {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..45be29e5f01e 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,17 @@ struct btrfs_free_space {
 	struct list_head list;
 };
 
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path);
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode);
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..1af1ea88e8a8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1700,6 +1700,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len);
 		BUG_ON(ret);
 	} else {
+		BUG_ON(root == root->fs_info->tree_root);
 		ret = insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
@@ -3196,7 +3197,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
-	if (root->ref_cows)
+	if (root->ref_cows || root == root->fs_info->tree_root)
 		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 
 	path = btrfs_alloc_path();
@@ -3344,7 +3345,8 @@ delete:
 		} else {
 			break;
 		}
-		if (found_extent && root->ref_cows) {
+		if (found_extent && (root->ref_cows ||
+				     root == root->fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
@@ -3675,7 +3677,8 @@ void btrfs_evict_inode(struct inode *inode)
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
-	if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+			       root == root->fs_info->tree_root))
 		goto no_delete;
 
 	if (is_bad_inode(inode)) {
@@ -3888,7 +3891,14 @@ static void inode_tree_del(struct inode *inode)
 	}
 	spin_unlock(&root->inode_lock);
 
-	if (empty && btrfs_root_refs(&root->root_item) == 0) {
+	/*
+	 * Free space cache has inodes in the tree root, but the tree root has a
+	 * root_refs of 0, so this could end up dropping the tree root as a
+	 * snapshot, so we need the extra !root->fs_info->tree_root check to
+	 * make sure we don't drop it.
+	 */
+	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+	    root != root->fs_info->tree_root) {
 		synchronize_srcu(&root->fs_info->subvol_srcu);
 		spin_lock(&root->inode_lock);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4282,14 +4292,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	bool nolock = false;
 
 	if (BTRFS_I(inode)->dummy_inode)
 		return 0;
 
+	smp_mb();
+	nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+
 	if (wbc->sync_mode == WB_SYNC_ALL) {
-		trans = btrfs_join_transaction(root, 1);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root, 1);
+		else
+			trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
-		ret = btrfs_commit_transaction(trans, root);
+		if (nolock)
+			ret = btrfs_end_transaction_nolock(trans, root);
+		else
+			ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
@@ -6308,6 +6328,21 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
+	if (root == root->fs_info->tree_root) {
+		struct btrfs_block_group_cache *block_group;
+
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						BTRFS_I(inode)->block_group);
+		if (block_group && block_group->inode == inode) {
+			spin_lock(&block_group->lock);
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			btrfs_put_block_group(block_group);
+		} else if (block_group) {
+			btrfs_put_block_group(block_group);
+		}
+	}
+
 	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6340,7 +6375,8 @@ int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
-	if (btrfs_root_refs(&root->root_item) == 0)
+	if (btrfs_root_refs(&root->root_item) == 0 &&
+	    root != root->fs_info->tree_root)
 		return 1;
 	else
 		return generic_drop_inode(inode);
@@ -6757,27 +6793,33 @@ out_unlock:
 	return err;
 }
 
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
-			      u64 start, u64 num_bytes, u64 min_size,
-			      loff_t actual_len, u64 *alloc_hint)
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+				       u64 start, u64 num_bytes, u64 min_size,
+				       loff_t actual_len, u64 *alloc_hint,
+				       struct btrfs_trans_handle *trans)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
 	int ret = 0;
+	bool own_trans = true;
 
+	if (trans)
+		own_trans = false;
 	while (num_bytes > 0) {
-		trans = btrfs_start_transaction(root, 3);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			break;
+		if (own_trans) {
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
 		}
 
 		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
 					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
-			btrfs_end_transaction(trans, root);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
 			break;
 		}
 
@@ -6810,11 +6852,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
-		btrfs_end_transaction(trans, root);
+		if (own_trans)
+			btrfs_end_transaction(trans, root);
 	}
 	return ret;
 }
 
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint,
+					   NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint, trans);
+}
+
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..af339eee55b8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
 #include "locking.h"
 #include "btrfs_inode.h"
 #include "async-thread.h"
+#include "free-space-cache.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -3191,6 +3192,54 @@ static int block_use_full_backref(struct reloc_control *rc,
 	return ret;
 }
 
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+				    struct inode *inode, u64 ino)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_trans_handle *trans;
+	unsigned long nr;
+	int ret = 0;
+
+	if (inode)
+		goto truncate;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+		if (inode && !IS_ERR(inode))
+			iput(inode);
+		return -ENOENT;
+	}
+
+truncate:
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+
+	btrfs_free_path(path);
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root, nr);
+out:
+	iput(inode);
+	return ret;
+}
+
 /*
  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
  * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3266,27 @@ static int find_data_references(struct reloc_control *rc,
 	int counted;
 	int ret;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
 
+	/*
+	 * This is an extent belonging to the free space cache, lets just delete
+	 * it and redo the search.
+	 */
+	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+		ret = delete_block_group_cache(rc->extent_root->fs_info,
+					       NULL, ref_objectid);
+		if (ret != -ENOENT)
+			return ret;
+		ret = 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
@@ -3860,6 +3921,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 {
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	struct reloc_control *rc;
+	struct inode *inode;
+	struct btrfs_path *path;
 	int ret;
 	int rw = 0;
 	int err = 0;
@@ -3882,6 +3945,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		rw = 1;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+					path);
+	btrfs_free_path(path);
+
+	if (!IS_ERR(inode))
+		ret = delete_block_group_cache(fs_info, inode, 0);
+	else
+		ret = PTR_ERR(inode);
+
+	if (ret && ret != -ENOENT) {
+		err = ret;
+		goto out;
+	}
+
 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
 	if (IS_ERR(rc->data_inode)) {
 		err = PTR_ERR(rc->data_inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..5c23eb8d6ba3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ enum {
 	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-	Opt_discard, Opt_err,
+	Opt_discard, Opt_space_cache, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -92,6 +92,7 @@ static match_table_t tokens = {
 	{Opt_flushoncommit, "flushoncommit"},
 	{Opt_ratio, "metadata_ratio=%d"},
 	{Opt_discard, "discard"},
+	{Opt_space_cache, "space_cache"},
 	{Opt_err, NULL},
 };
 
@@ -235,6 +236,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_discard:
 			btrfs_set_opt(info->mount_opt, DISCARD);
 			break;
+		case Opt_space_cache:
+			printk(KERN_INFO "btrfs: enabling disk space caching\n");
+			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..e7144c48ed79 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
 	TRANS_START,
 	TRANS_JOIN,
 	TRANS_USERSPACE,
+	TRANS_JOIN_NOLOCK,
 };
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -186,7 +187,8 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_lock(&root->fs_info->trans_mutex);
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
@@ -195,7 +197,8 @@ again:
 
 	cur_trans = root->fs_info->running_transaction;
 	cur_trans->use_count++;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_unlock(&root->fs_info->trans_mutex);
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
@@ -224,9 +227,11 @@ again:
 		}
 	}
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_lock(&root->fs_info->trans_mutex);
 	record_root_in_trans(h, root);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (!current->journal_info && type != TRANS_USERSPACE)
 		current->journal_info = h;
@@ -244,6 +249,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 	return start_transaction(root, 0, TRANS_JOIN);
 }
 
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+							  int num_blocks)
+{
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+}
+
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks)
 {
@@ -348,7 +359,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, int throttle)
+			  struct btrfs_root *root, int throttle, int lock)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -376,18 +387,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 
-	if (!root->fs_info->open_ioctl_trans &&
+	if (lock && !root->fs_info->open_ioctl_trans &&
 	    should_end_transaction(trans, root))
 		trans->transaction->blocked = 1;
 
-	if (cur_trans->blocked && !cur_trans->in_commit) {
+	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
 		if (throttle)
 			return btrfs_commit_transaction(trans, root);
 		else
 			wake_up_process(info->transaction_kthread);
 	}
 
-	mutex_lock(&info->trans_mutex);
+	if (lock)
+		mutex_lock(&info->trans_mutex);
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
@@ -395,7 +407,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
-	mutex_unlock(&info->trans_mutex);
+	if (lock)
+		mutex_unlock(&info->trans_mutex);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -411,13 +424,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 0);
+	return __btrfs_end_transaction(trans, root, 0, 1);
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 1);
+	return __btrfs_end_transaction(trans, root, 1, 1);
+}
+
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 0, 0);
 }
 
 /*
@@ -966,6 +985,8 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root = root_item->bytenr;
 	super->generation = root_item->generation;
 	super->root_level = root_item->level;
+	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+		super->cache_generation = root_item->generation;
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bfd..15f83e1c1ef7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,10 +87,14 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+							  int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 9343919c1495b085a4a1cf4cbada8d7888daf099 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:56 -0400
Subject: fanotify: allow fanotify to be built

We disabled the ability to build fanotify in commit 7c5347733dcc4ba0ba.
This reverts that commit and allows people to build fanotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Kconfig    | 2 +-
 include/linux/Kbuild | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index b388443c3a09..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
-#source "fs/notify/fanotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 90e3ed3a3144..97319a8fc1e0 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -118,6 +118,7 @@ header-y += eventpoll.h
 header-y += ext2_fs.h
 header-y += fadvise.h
 header-y += falloc.h
+header-y += fanotify.h
 header-y += fb.h
 header-y += fcntl.h
 header-y += fd.h
-- 
cgit v1.2.3


From 6ad2d4e3e97ee4bfde0b45e8dfe37911330fc4aa Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:56 -0400
Subject: fsnotify: implement ordering between notifiers

fanotify needs to be able to specify that some groups get events before
others.  They use this idea to make sure that a hierarchical storage
manager gets access to files before programs which actually use them.  This
is purely infrastructure.  Everything will have a priority of 0, but the
infrastructure will exist for it to be non-zero.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inode_mark.c           | 9 +++++++--
 fs/notify/vfsmount_mark.c        | 6 +++++-
 include/linux/fsnotify_backend.h | 8 ++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 21ed10660b80..4c29fcf557d1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
  * Attach an initialized mark to a given inode.
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group and for which inodes.  These
- * marks are ordered according to the group's location in memory.
+ * marks are ordered according to priority, highest number first, and then by
+ * the group's location in memory.
  */
 int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 			    struct fsnotify_group *group, struct inode *inode,
@@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group < lmark->group)
+		if (mark->group->priority < lmark->group->priority)
+			continue;
+
+		if ((mark->group->priority == lmark->group->priority) &&
+		    (mark->group < lmark->group))
 			continue;
 
 		hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 56772b578fbd..85eebff6d0d7 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group < lmark->group)
+		if (mark->group->priority < lmark->group->priority)
+			continue;
+
+		if ((mark->group->priority == lmark->group->priority) &&
+		    (mark->group < lmark->group))
 			continue;
 
 		hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e40190d16878..825329534162 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -129,6 +129,14 @@ struct fsnotify_group {
 	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
 	unsigned int q_len;			/* events on the queue */
 	unsigned int max_events;		/* maximum events allowed on the list */
+	/*
+	 * Valid fsnotify group priorities.  Events are send in order from highest
+	 * priority to lowest priority.  We default to the lowest priority.
+	 */
+	#define FS_PRIO_0	0 /* normal notifiers, no permissions */
+	#define FS_PRIO_1	1 /* fanotify content based access control */
+	#define FS_PRIO_2	2 /* fanotify pre-content access */
+	unsigned int priority;
 
 	/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
 	spinlock_t mark_lock;		/* protect marks_list */
-- 
cgit v1.2.3


From 4231a23530a30e86eb32fbe869bbef1b3e54d5aa Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:56 -0400
Subject: fanotify: implement fanotify listener ordering

The fanotify listeners needs to be able to specify what types of operations
they are going to perform so they can be ordered appropriately between other
listeners doing other types of operations.  They need this to be able to make
sure that things like hierarchichal storage managers will get access to inodes
before processes which need the data.  This patch defines 3 possible uses
which groups must indicate in the fanotify_init() flags.

FAN_CLASS_PRE_CONTENT
FAN_CLASS_CONTENT
FAN_CLASS_NOTIF

Groups will receive notification in that order.  The order between 2 groups in
the same class is undeterministic.

FAN_CLASS_PRE_CONTENT is intended to be used by listeners which need access to
the inode before they are certain that the inode contains it's final data.  A
hierarchical storage manager should choose to use this class.

FAN_CLASS_CONTENT is intended to be used by listeners which need access to the
inode after it contains its intended contents.  This would be the appropriate
level for an AV solution or document control system.

FAN_CLASS_NOTIF is intended for normal async notification about access, much the
same as inotify and dnotify.  Syncronous permissions events are not permitted
at this class.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 25 ++++++++++++++++++++++++-
 include/linux/fanotify.h           | 11 ++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bbcb98e7fcc6..1c09e6321c5e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -664,6 +664,20 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
 #endif
+	switch (flags & FAN_ALL_CLASS_BITS) {
+	case FAN_CLASS_NOTIF:
+		group->priority = FS_PRIO_0;
+		break;
+	case FAN_CLASS_CONTENT:
+		group->priority = FS_PRIO_1;
+		break;
+	case FAN_CLASS_PRE_CONTENT:
+		group->priority = FS_PRIO_2;
+		break;
+	default:
+		fd = -EINVAL;
+		goto out_put_group;
+	}
 
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
@@ -719,6 +733,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	ret = -EINVAL;
 	if (unlikely(filp->f_op != &fanotify_fops))
 		goto fput_and_out;
+	group = filp->private_data;
+
+	/*
+	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
+	 * allowed to set permissions events.
+	 */
+	ret = -EINVAL;
+	if (mask & FAN_ALL_PERM_EVENTS &&
+	    group->priority == FS_PRIO_0)
+		goto fput_and_out;
 
 	ret = fanotify_find_path(dfd, pathname, &path, flags);
 	if (ret)
@@ -729,7 +753,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 		inode = path.dentry->d_inode;
 	else
 		mnt = path.mnt;
-	group = filp->private_data;
 
 	/* create/update an inode mark */
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 63531a6b4d2a..2c89ce7b644e 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -25,7 +25,16 @@
 #define FAN_CLOEXEC		0x00000001
 #define FAN_NONBLOCK		0x00000002
 
-#define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK)
+/* These are NOT bitwise flags.  Both bits are used togther.  */
+#define FAN_CLASS_NOTIF		0x00000000
+#define FAN_CLASS_CONTENT	0x00000004
+#define FAN_CLASS_PRE_CONTENT	0x00000008
+
+#define FAN_ALL_CLASS_BITS	(FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \
+				 FAN_CLASS_PRE_CONTENT)
+
+#define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
+				 FAN_ALL_CLASS_BITS)
 
 /* flags used for fanotify_modify_mark() */
 #define FAN_MARK_ADD		0x00000001
-- 
cgit v1.2.3


From ff8bcbd03da881bf1171910c6c07d44bd3c0a234 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:56 -0400
Subject: fsnotify: correctly handle return codes from listeners

When fsnotify groups return errors they are ignored.  For permissions
events these should be passed back up the stack, but for most events these
should continue to be ignored.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 20 ++++++++++++--------
 include/linux/fsnotify_backend.h |  2 ++
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4498a208df94..57ecadd85abf 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -252,20 +252,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 		if (inode_group > vfsmount_group) {
 			/* handle inode */
-			send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
-				      data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+					    data_is, cookie, file_name, &event);
 			/* we didn't use the vfsmount_mark */
 			vfsmount_group = NULL;
 		} else if (vfsmount_group > inode_group) {
-			send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
-				      data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
+					    data_is, cookie, file_name, &event);
 			inode_group = NULL;
 		} else {
-			send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
-				      mask, data, data_is, cookie, file_name,
-				      &event);
+			ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+					    mask, data, data_is, cookie, file_name,
+					    &event);
 		}
 
+		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
+			goto out;
+
 		if (inode_group)
 			inode_node = srcu_dereference(inode_node->next,
 						      &fsnotify_mark_srcu);
@@ -273,7 +276,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 			vfsmount_node = srcu_dereference(vfsmount_node->next,
 							 &fsnotify_mark_srcu);
 	}
-
+	ret = 0;
+out:
 	srcu_read_unlock(&fsnotify_mark_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 825329534162..026892187c83 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -64,6 +64,8 @@
 
 #define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
 
+#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM)
+
 #define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
 			     FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \
 			     FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \
-- 
cgit v1.2.3


From 52420392c81c8712f555e6bcd116d8bd214ce43a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:56 -0400
Subject: fsnotify: call fsnotify_parent in perm events

fsnotify perm events do not call fsnotify parent.  That means you cannot
register a perm event on a directory and enforce permissions on all inodes in
that directory.  This patch fixes that situation.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 15 +++++++++------
 include/linux/fsnotify.h         |  9 +++++++--
 include/linux/fsnotify_backend.h |  8 +++++---
 3 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 57ecadd85abf..20dc218707ca 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,16 +84,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
+	int ret = 0;
 
 	if (!dentry)
 		dentry = path->dentry;
 
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
-		return;
+		return 0;
 
 	parent = dget_parent(dentry);
 	p_inode = parent->d_inode;
@@ -106,14 +107,16 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 		mask |= FS_EVENT_ON_CHILD;
 
 		if (path)
-			fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
-				 dentry->d_name.name, 0);
+			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+				       dentry->d_name.name, 0);
 		else
-			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-				 dentry->d_name.name, 0);
+			ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+				       dentry->d_name.name, 0);
 	}
 
 	dput(parent);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 59d0df43ff9d..5059faacceab 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,12 +26,12 @@ static inline void fsnotify_d_instantiate(struct dentry *dentry,
 }
 
 /* Notify this dentry's parent about a child's events. */
-static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+static inline int fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	if (!dentry)
 		dentry = path->dentry;
 
-	__fsnotify_parent(path, dentry, mask);
+	return __fsnotify_parent(path, dentry, mask);
 }
 
 /* simple call site for access decisions */
@@ -40,6 +40,7 @@ static inline int fsnotify_perm(struct file *file, int mask)
 	struct path *path = &file->f_path;
 	struct inode *inode = path->dentry->d_inode;
 	__u32 fsnotify_mask = 0;
+	int ret;
 
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
@@ -52,6 +53,10 @@ static inline int fsnotify_perm(struct file *file, int mask)
 	else
 		BUG();
 
+	ret = fsnotify_parent(path, NULL, fsnotify_mask);
+	if (ret)
+		return ret;
+
 	return fsnotify(inode, fsnotify_mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 026892187c83..b37f3a71a9dc 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -304,7 +304,7 @@ struct fsnotify_mark {
 /* main fsnotify call to send events */
 extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		    const unsigned char *name, u32 cookie);
-extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
+extern int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern u32 fsnotify_get_cookie(void);
@@ -433,8 +433,10 @@ static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int da
 	return 0;
 }
 
-static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
-{}
+static inline int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+{
+	return 0;
+}
 
 static inline void __fsnotify_inode_delete(struct inode *inode)
 {}
-- 
cgit v1.2.3


From 5322a59f14e4cae5f878b9c0c5612d403c230d7f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:57 -0400
Subject: fanotify: ignore fanotify ignore marks if open writers

fanotify will clear ignore marks if a task changes the contents of an
inode.  The problem is with the races around when userspace finishes
checking a file and when that result is actually attached to the inode.
This race was described as such:

Consider the following scenario with hostile processes A and B, and
victim process C:
1. Process A opens new file for writing. File check request is generated.
2. File check is performed in userspace. Check result is "file has no malware".
3. The "permit" response is delivered to kernel space.
4. File ignored mark set.
5. Process A writes dummy bytes to the file. File ignored flags are cleared.
6. Process B opens the same file for reading. File check request is generated.
7. File check is performed in userspace. Check result is "file has no malware".
8. Process A writes malware bytes to the file. There is no cached response yet.
9. The "permit" response is delivered to kernel space and is cached in fanotify.
10. File ignored mark set.
11. Now any process C will be permitted to open the malware file.
There is a race between steps 8 and 10

While fanotify makes no strong guarantees about systems with hostile
processes there is no reason we cannot harden against this race.  We do
that by simply ignoring any ignore marks if the inode has open writers (aka
i_writecount > 0).  (We actually do not ignore ignore marks if the
FAN_MARK_SURV_MODIFY flag is set)

Reported-by: Vasily Novikov <vasily.novikov@kaspersky.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1c09e6321c5e..b265936e92d6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -610,6 +610,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
+	/*
+	 * If some other task has this inode open for write we should not add
+	 * an ignored mark, unless that ignored mark is supposed to survive
+	 * modification changes anyway.
+	 */
+	if ((flags & FAN_MARK_IGNORED_MASK) &&
+	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+	    (atomic_read(&inode->i_writecount) > 0))
+		return 0;
+
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
 		int ret;
-- 
cgit v1.2.3


From 2529a0df0f64dab1f60ae08e038b89c53a6b4c02 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:57 -0400
Subject: fsnotify: implement a default maximum queue depth

Currently fanotify has no maximum queue depth.  Since fanotify is
CAP_SYS_ADMIN only this does not pose a normal user DoS issue, but it
certianly is possible that an fanotify listener which can't keep up could
OOM the box.  This patch implements a default 16k depth.  This is the same
default depth used by inotify, but given fanotify's better queue merging in
many situations this queue will contain many additional useful events by
comparison.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 4 ++++
 include/linux/fanotify.h           | 1 -
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b265936e92d6..04f2fe47b66a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,8 @@
 
 #include <asm/ioctls.h>
 
+#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
+
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
 static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -689,6 +691,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		goto out_put_group;
 	}
 
+	group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_put_group;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index a97c96d28c07..ed479b6fef7b 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -12,7 +12,6 @@
 
 #define FAN_EVENT_ON_CHILD	0x08000000	/* interested in child events */
 
-/* FIXME currently Q's have no limit.... */
 #define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
 
 #define FAN_OPEN_PERM		0x00010000	/* File open in perm check */
-- 
cgit v1.2.3


From 5dd03f55fd2f21916ce248bb2e68bbfb39d94fe5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:57 -0400
Subject: fanotify: allow userspace to override max queue depth

fanotify has a defualt max queue depth.  This patch allows processes which
explicitly request it to have an 'unlimited' queue depth.  These processes
need to be very careful to make sure they cannot fall far enough behind
that they OOM the box.  Thus this flag is gated on CAP_SYS_ADMIN.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 9 ++++++++-
 include/linux/fanotify.h           | 5 +++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 04f2fe47b66a..43d66d9b2eff 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -691,7 +691,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		goto out_put_group;
 	}
 
-	group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+	if (flags & FAN_UNLIMITED_QUEUE) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_put_group;
+		group->max_events = UINT_MAX;
+	} else {
+		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+	}
 
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index ed479b6fef7b..e37f559c95e1 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -28,12 +28,13 @@
 #define FAN_CLASS_NOTIF		0x00000000
 #define FAN_CLASS_CONTENT	0x00000004
 #define FAN_CLASS_PRE_CONTENT	0x00000008
-
 #define FAN_ALL_CLASS_BITS	(FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \
 				 FAN_CLASS_PRE_CONTENT)
 
+#define FAN_UNLIMITED_QUEUE	0x00000010
+
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
-				 FAN_ALL_CLASS_BITS)
+				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE)
 
 /* flags used for fanotify_modify_mark() */
 #define FAN_MARK_ADD		0x00000001
-- 
cgit v1.2.3


From e7099d8a5a34d2876908a9fab4952dabdcfc5909 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:57 -0400
Subject: fanotify: limit the number of marks in a single fanotify group

There is currently no limit on the number of marks a given fanotify group
can have.  Since fanotify is gated on CAP_SYS_ADMIN this was not seen as
a serious DoS threat.  This patch implements a default of 8192, the same as
inotify to work towards removing the CAP_SYS_ADMIN gating and eliminating
the default DoS'able status.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 9 +++++++++
 include/linux/fsnotify_backend.h   | 1 +
 2 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 43d66d9b2eff..1d33d7db277a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,7 @@
 #include <asm/ioctls.h>
 
 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
+#define FANOTIFY_DEFAULT_MAX_MARKS	8192
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
@@ -584,6 +585,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 	if (!fsn_mark) {
 		int ret;
 
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+			return -ENOSPC;
+
 		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
 		if (!fsn_mark)
 			return -ENOMEM;
@@ -626,6 +630,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	if (!fsn_mark) {
 		int ret;
 
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+			return -ENOSPC;
+
 		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
 		if (!fsn_mark)
 			return -ENOMEM;
@@ -700,6 +707,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
 	}
 
+	group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_put_group;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b37f3a71a9dc..49ceed6e92b1 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -169,6 +169,7 @@ struct fsnotify_group {
 			bool bypass_perm; /* protected by access_mutex */
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
 			int f_flags;
+			unsigned int max_marks;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
-- 
cgit v1.2.3


From ac7e22dcfafd04c842a02057afd6541c1d613ef9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:58 -0400
Subject: fanotify: allow userspace to override max marks

Some fanotify groups, especially those like AV scanners, will need to place
lots of marks, particularly ignore marks.  Since ignore marks do not pin
inodes in cache and are cleared if the inode is removed from core (usually
under memory pressure) we expose an interface for listeners, with
CAP_SYS_ADMIN, to override the maximum number of marks and be allowed to
set and 'unlimited' number of marks.  Programs which make use of this
feature will be able to OOM a machine.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 9 ++++++++-
 include/linux/fanotify.h           | 4 +++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1d33d7db277a..f9216102b426 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -707,7 +707,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
 	}
 
-	group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+	if (flags & FAN_UNLIMITED_MARKS) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_put_group;
+		group->fanotify_data.max_marks = UINT_MAX;
+	} else {
+		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+	}
 
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index e37f559c95e1..7592a366a57b 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -32,9 +32,11 @@
 				 FAN_CLASS_PRE_CONTENT)
 
 #define FAN_UNLIMITED_QUEUE	0x00000010
+#define FAN_UNLIMITED_MARKS	0x00000020
 
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
-				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE)
+				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\
+				 FAN_UNLIMITED_MARKS)
 
 /* flags used for fanotify_modify_mark() */
 #define FAN_MARK_ADD		0x00000001
-- 
cgit v1.2.3


From 4afeff8505cb8a38e36c1ef2bd3447c4b8f87367 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:58 -0400
Subject: fanotify: limit number of listeners per user

fanotify currently has no limit on the number of listeners a given user can
have open.  This patch limits the total number of listeners per user to
128.  This is the same as the inotify default limit.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c      | 11 ++++++++++-
 fs/notify/fanotify/fanotify_user.c | 11 +++++++++++
 include/linux/fsnotify_backend.h   |  1 +
 include/linux/sched.h              |  3 +++
 4 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 85366c78cc37..60c11c306fd9 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -200,10 +200,19 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 	return false;
 }
 
+static void fanotify_free_group_priv(struct fsnotify_group *group)
+{
+	struct user_struct *user;
+
+	user = group->fanotify_data.user;
+	atomic_dec(&user->fanotify_listeners);
+	free_uid(user);
+}
+
 const struct fsnotify_ops fanotify_fsnotify_ops = {
 	.handle_event = fanotify_handle_event,
 	.should_send_event = fanotify_should_send_event,
-	.free_group_priv = NULL,
+	.free_group_priv = fanotify_free_group_priv,
 	.free_event_priv = NULL,
 	.freeing_mark = NULL,
 };
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f9216102b426..a7d9369482d5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -18,6 +18,7 @@
 
 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
 #define FANOTIFY_DEFAULT_MAX_MARKS	8192
+#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
@@ -656,6 +657,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 {
 	struct fsnotify_group *group;
 	int f_flags, fd;
+	struct user_struct *user;
 
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		__func__, flags, event_f_flags);
@@ -666,6 +668,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (flags & ~FAN_ALL_INIT_FLAGS)
 		return -EINVAL;
 
+	user = get_current_user();
+	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
+		free_uid(user);
+		return -EMFILE;
+	}
+
 	f_flags = O_RDWR | FMODE_NONOTIFY;
 	if (flags & FAN_CLOEXEC)
 		f_flags |= O_CLOEXEC;
@@ -677,6 +685,9 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
+	group->fanotify_data.user = user;
+	atomic_inc(&user->fanotify_listeners);
+
 	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	mutex_init(&group->fanotify_data.access_mutex);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 49ceed6e92b1..4366f458a86a 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -170,6 +170,7 @@ struct fsnotify_group {
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
 			int f_flags;
 			unsigned int max_marks;
+			struct user_struct *user;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index be7adb7588e5..6f420baf37ca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -672,6 +672,9 @@ struct user_struct {
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
 	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
 #endif
+#ifdef CONFIG_FANOTIFY
+	atomic_t fanotify_listeners;
+#endif
 #ifdef CONFIG_EPOLL
 	atomic_t epoll_watches;	/* The number of file descriptors currently watched */
 #endif
-- 
cgit v1.2.3


From e1c048ba786789afdc66f32d8394bb5a0014bbba Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:58 -0400
Subject: fanotify: do not send events for irregular files

fanotify_should_send_event has a test to see if an object is a file or
directory and does not send an event otherwise.  The problem is that the
test is actually checking if the object with a mark is a file or directory,
not if the object the event happened on is a file or directory.  We should
check the latter.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 60c11c306fd9..8d98e1f5817b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -160,20 +160,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 				       __u32 event_mask, void *data, int data_type)
 {
 	__u32 marks_mask, marks_ignored_mask;
+	struct path *path = data;
 
 	pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
 		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
 		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
 
-	/* sorry, fanotify only gives a damn about files and dirs */
-	if (!S_ISREG(to_tell->i_mode) &&
-	    !S_ISDIR(to_tell->i_mode))
-		return false;
-
 	/* if we don't have enough info to send an event to userspace say no */
 	if (data_type != FSNOTIFY_EVENT_PATH)
 		return false;
 
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(path->dentry->d_inode->i_mode) &&
+	    !S_ISDIR(path->dentry->d_inode->i_mode))
+		return false;
+
 	if (inode_mark && vfsmnt_mark) {
 		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
 		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
-- 
cgit v1.2.3


From b29866aab8489487f11cc4506590ac31bdbae22a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:58 -0400
Subject: fsnotify: rename FS_IN_ISDIR to FS_ISDIR

The _IN_ in the naming is reserved for flags only used by inotify.  Since I
am about to use this flag for fanotify rename it to be generic like the
rest.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c |  2 +-
 include/linux/fsnotify.h         | 20 ++++++++++----------
 include/linux/fsnotify_backend.h |  4 ++--
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 24edc1185d53..444c305a468c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -862,7 +862,7 @@ static int __init inotify_user_setup(void)
 	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
 	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
 	BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
-	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+	BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
 	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
 
 	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 5059faacceab..ecb43b33d181 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -98,8 +98,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		old_dir_mask |= FS_DN_RENAME;
 
 	if (isdir) {
-		old_dir_mask |= FS_IN_ISDIR;
-		new_dir_mask |= FS_IN_ISDIR;
+		old_dir_mask |= FS_ISDIR;
+		new_dir_mask |= FS_ISDIR;
 	}
 
 	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
@@ -137,7 +137,7 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 	__u32 mask = FS_DELETE;
 
 	if (isdir)
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	fsnotify_parent(NULL, dentry, mask);
 }
@@ -179,7 +179,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
  */
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
-	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
+	__u32 mask = (FS_CREATE | FS_ISDIR);
 	struct inode *d_inode = dentry->d_inode;
 
 	audit_inode_child(dentry, inode);
@@ -197,7 +197,7 @@ static inline void fsnotify_access(struct file *file)
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
 		fsnotify_parent(path, NULL, mask);
@@ -215,7 +215,7 @@ static inline void fsnotify_modify(struct file *file)
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
 		fsnotify_parent(path, NULL, mask);
@@ -233,7 +233,7 @@ static inline void fsnotify_open(struct file *file)
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
 		fsnotify_parent(path, NULL, mask);
@@ -252,7 +252,7 @@ static inline void fsnotify_close(struct file *file)
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
 		fsnotify_parent(path, NULL, mask);
@@ -269,7 +269,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	__u32 mask = FS_ATTRIB;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= FS_IN_ISDIR;
+		mask |= FS_ISDIR;
 
 	fsnotify_parent(NULL, dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
@@ -304,7 +304,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 
 	if (mask) {
 		if (S_ISDIR(inode->i_mode))
-			mask |= FS_IN_ISDIR;
+			mask |= FS_ISDIR;
 
 		fsnotify_parent(NULL, dentry, mask);
 		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4366f458a86a..b36041e9cd34 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -45,7 +45,7 @@
 #define FS_ACCESS_PERM		0x00020000	/* access event in a permissions hook */
 
 #define FS_EXCL_UNLINK		0x04000000	/* do not send events if object is unlinked */
-#define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
+#define FS_ISDIR		0x40000000	/* event occurred against dir */
 #define FS_IN_ONESHOT		0x80000000	/* only send event once */
 
 #define FS_DN_RENAME		0x10000000	/* file renamed */
@@ -72,7 +72,7 @@
 			     FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
 			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
 			     FS_OPEN_PERM | FS_ACCESS_PERM | FS_EXCL_UNLINK | \
-			     FS_IN_ISDIR | FS_IN_ONESHOT | FS_DN_RENAME | \
+			     FS_ISDIR | FS_IN_ONESHOT | FS_DN_RENAME | \
 			     FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
 
 struct fsnotify_group;
-- 
cgit v1.2.3


From 8fcd65280abc4699510f1853ede31f43e8a3783a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:59 -0400
Subject: fanotify: ignore events on directories unless specifically requested

fanotify has a very limited number of events it sends on directories.  The
usefulness of these events is yet to be seen and still we send them.  This
is particularly painful for mount marks where one might receive many of
these useless events.  As such this patch will drop events on IS_DIR()
inodes unless they were explictly requested with FAN_ON_DIR.

This means that a mark on a directory without FAN_EVENT_ON_CHILD or
FAN_ON_DIR is meaningless and will result in no events ever (although it
will still be allowed since detecting it is hard)

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c      |  5 +++++
 fs/notify/fanotify/fanotify_user.c | 12 ++++++++++++
 include/linux/fanotify.h           | 10 ++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 8d98e1f5817b..b04f88eed09e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -131,6 +131,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
 	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
 	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
@@ -195,6 +196,10 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 		BUG();
 	}
 
+	if (S_ISDIR(path->dentry->d_inode->i_mode) &&
+	    (marks_ignored_mask & FS_ISDIR))
+		return false;
+
 	if (event_mask & marks_mask & ~marks_ignored_mask)
 		return true;
 
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a7d9369482d5..ff1a908c9708 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -570,6 +570,12 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
+
+	if (!(flags & FAN_MARK_ONDIR)) {
+		__u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+	}
+
 	spin_unlock(&fsn_mark->lock);
 
 	return mask & ~oldmask;
@@ -766,6 +772,12 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	default:
 		return -EINVAL;
 	}
+
+	if (mask & FAN_ONDIR) {
+		flags |= FAN_MARK_ONDIR;
+		mask &= ~FAN_ONDIR;
+	}
+
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
 #else
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 7592a366a57b..5e0400a80c33 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -10,13 +10,15 @@
 #define FAN_CLOSE_NOWRITE	0x00000010	/* Writtable file closed */
 #define FAN_OPEN		0x00000020	/* File was opened */
 
-#define FAN_EVENT_ON_CHILD	0x08000000	/* interested in child events */
-
 #define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
 
 #define FAN_OPEN_PERM		0x00010000	/* File open in perm check */
 #define FAN_ACCESS_PERM		0x00020000	/* File accessed in perm check */
 
+#define FAN_ONDIR		0x40000000	/* event occurred against dir */
+
+#define FAN_EVENT_ON_CHILD	0x08000000	/* interested in child events */
+
 /* helper events */
 #define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
 
@@ -47,6 +49,10 @@
 #define FAN_MARK_IGNORED_MASK	0x00000020
 #define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
 #define FAN_MARK_FLUSH		0x00000080
+#ifdef __KERNEL__
+/* not valid from userspace, only kernel internal */
+#define FAN_MARK_ONDIR		0x00000100
+#endif
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
-- 
cgit v1.2.3


From 192ca4d1941228e69c1fbeebab317725407e6e65 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Oct 2010 17:21:59 -0400
Subject: fanotify: do not recalculate the mask if the ignored mask changed

If fanotify sets a new bit in the ignored mask it will cause the generic
fsnotify layer to recalculate the real mask.  This is stupid since we
didn't change that part.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ff1a908c9708..fa71d5dfd102 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -558,15 +558,15 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 				       __u32 mask,
 				       unsigned int flags)
 {
-	__u32 oldmask;
+	__u32 oldmask = -1;
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
 		oldmask = fsn_mark->mask;
 		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
 	} else {
-		oldmask = fsn_mark->ignored_mask;
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
+		__u32 tmask = fsn_mark->ignored_mask | mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
-- 
cgit v1.2.3


From 19ba54f4645f8c5edae4b08919a37a409b8793aa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 28 Oct 2010 17:21:59 -0400
Subject: fs/notify/fanotify/fanotify_user.c: fix warnings

fs/notify/fanotify/fanotify_user.c: In function 'fanotify_release':
fs/notify/fanotify/fanotify_user.c:375: warning: unused variable 'lre'
fs/notify/fanotify/fanotify_user.c:375: warning: unused variable 're'

this is really ugly.

Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index fa71d5dfd102..fce66dfbf7d5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -376,11 +376,10 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
-	struct fanotify_response_event *re, *lre;
-
-	pr_debug("%s: file=%p group=%p\n", __func__, file, group);
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response_event *re, *lre;
+
 	mutex_lock(&group->fanotify_data.access_mutex);
 
 	group->fanotify_data.bypass_perm = true;
-- 
cgit v1.2.3


From b1142e8fec6a594723e5054055a7b53379b90490 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 28 Oct 2010 17:33:57 -0400
Subject: ext4: BUG_ON fix: check if page has buffers before calling
 page_buffers()

We need to make check if a page does not have buffes by checking
page_has_buffers(page) before calling page_buffers(page) in
ext4_writepage().  Otherwise page_buffers() could throw a BUG_ON.

Thanks also to Markus Trippelsdorf and Avinash Kurup who also reported
the problem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Sedat Dilek <sedat.dilek@googlemail.com>
Tested-by: Sedat Dilek <sedat.dilek@googlemail.com>
---
 fs/ext4/inode.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2d6c6c8c036d..191616470466 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2718,7 +2718,7 @@ static int ext4_writepage(struct page *page,
 	 * try to create them using __block_write_begin.  If this
 	 * fails, redirty the page and move on.
 	 */
-	if (!page_buffers(page)) {
+	if (!page_has_buffers(page)) {
 		if (__block_write_begin(page, 0, len,
 					noalloc_get_block_write)) {
 		redirty_page:
@@ -2732,12 +2732,10 @@ static int ext4_writepage(struct page *page,
 	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 			      ext4_bh_delay_or_unwritten)) {
 		/*
-		 * We don't want to do block allocation So redirty the
-		 * page and return We may reach here when we do a
-		 * journal commit via
-		 * journal_submit_inode_data_buffers.  If we don't
-		 * have mapping block we just ignore them. We can also
-		 * reach here via shrink_page_list
+		 * We don't want to do block allocation, so redirty
+		 * the page and return.  We may reach here when we do
+		 * a journal commit via journal_submit_inode_data_buffers.
+		 * We can also reach here via shrink_page_list
 		 */
 		goto redirty_page;
 	}
-- 
cgit v1.2.3


From d3ba50b17aa7a391bb5b3dcd8d6ba7a02c4f031c Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Wed, 27 Oct 2010 15:20:36 -0500
Subject: NTLM auth and sign - Use appropriate server challenge

Need to have cryptkey or server challenge in smb connection
(struct TCP_Server_Info) for ntlm and ntlmv2 auth types for which
cryptkey (Encryption Key) is supplied just once in Negotiate Protocol
response during an smb connection setup for all the smb sessions over
that smb connection.

For ntlmssp, cryptkey or server challenge is provided for every
smb session in type 2 packet of ntlmssp negotiation, the cryptkey
provided during Negotiation Protocol response before smb connection
does not count.

Rename cryptKey to cryptkey and related changes.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 10 +++++++---
 fs/cifs/cifsglob.h    |  3 ++-
 fs/cifs/cifssmb.c     |  4 ++--
 fs/cifs/connect.c     |  4 ++--
 fs/cifs/sess.c        | 12 ++++++++----
 5 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 17d603ad5e34..ef95a272f73d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -249,7 +249,7 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
 	}
 	ses->auth_key.len = temp_len;
 
-	SMBNTencrypt(ses->password, ses->cryptKey,
+	SMBNTencrypt(ses->password, ses->server->cryptkey,
 			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 
 	E_md4hash(ses->password, temp_key);
@@ -537,8 +537,12 @@ CalcNTLMv2_response(const struct cifsSesInfo *ses)
 		return rc;
 	}
 
-	memcpy(ses->auth_key.response + offset,
-		ses->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
+	if (ses->server->secType == RawNTLMSSP)
+		memcpy(ses->auth_key.response + offset,
+			ses->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	else
+		memcpy(ses->auth_key.response + offset,
+			ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
 		ses->auth_key.response + offset, ses->auth_key.len - offset);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 67d6a2280a01..b73695176467 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -196,6 +196,7 @@ struct TCP_Server_Info {
 	int capabilities; /* allow selective disabling of caps by smb sess */
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
@@ -240,7 +241,7 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
-	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 	struct session_key auth_key;
 	char ntlmv2_hash[16];
 	unsigned int tilen; /* length of the target info blob */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e98f1f317b15..2f2632b6df5a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 		if (rsp->EncryptionKeyLength ==
 				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(ses->cryptKey, rsp->EncryptionKey,
+			memcpy(ses->server->cryptkey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
 			rc = -EIO; /* need cryptkey unless plain text */
@@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-		memcpy(ses->cryptKey, pSMBr->u.EncryptionKey,
+		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
 			&& (pSMBr->EncryptionKeyLength == 0)) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 469c3ddba463..4d8004ce5834 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3002,13 +3002,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
 		    (ses->server->secType == LANMAN))
-			calc_lanman_hash(tcon->password, ses->cryptKey,
+			calc_lanman_hash(tcon->password, ses->server->cryptkey,
 					 ses->server->secMode &
 					    SECMODE_PW_ENCRYPT ? true : false,
 					 bcc_ptr);
 		else
 #endif /* CIFS_WEAK_PW_HASH */
-		SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr);
+		SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
 
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index e0515a62715d..f74c5a88dd4c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -399,7 +399,7 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 		return -EINVAL;
 	}
 
-	memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	memcpy(ses->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
 	/* BB we could decode pblob->NegotiateFlags; some may be useful */
 	/* In particular we can examine sign flags */
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
@@ -667,10 +667,14 @@ ssetup_ntlmssp_authenticate:
 		/* no capabilities flags in old lanman negotiation */
 
 		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-		/* BB calculate hash with password */
-		/* and copy into bcc */
 
-		calc_lanman_hash(ses->password, ses->cryptKey,
+		/* Calculate hash with password and copy into bcc_ptr.
+		 * Encryption Key (stored as in cryptkey) gets used if the
+		 * security mode bit in Negottiate Protocol response states
+		 * to use challenge/response method (i.e. Password bit is 1).
+		 */
+
+		calc_lanman_hash(ses->password, ses->server->cryptkey,
 				 ses->server->secMode & SECMODE_PW_ENCRYPT ?
 					true : false, lnm_session_key);
 
-- 
cgit v1.2.3


From d3686d54c7902a303bd65d751226aa1647319863 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 28 Oct 2010 09:53:07 -0500
Subject: cifs: Cleanup and thus reduce smb session structure and fields used
 during authentication

Removed following fields from smb session structure
 cryptkey, ntlmv2_hash, tilen, tiblob
and ntlmssp_auth structure is allocated dynamically only if the auth mech
in NTLMSSP.

response field within a session_key structure is used to initially store the
target info (either plucked from type 2 challenge packet in case of NTLMSSP
or fabricated in case of NTLMv2 without extended security) and then to store
Message Authentication Key (mak) (session key + client response).

Server challenge or cryptkey needed during a NTLMSSP authentication
is now part of ntlmssp_auth structure which gets allocated and freed
once authenticaiton process is done.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 53 ++++++++++++++++++++++++++++-----------------------
 fs/cifs/cifsglob.h    |  9 +++------
 fs/cifs/connect.c     | 11 +++++------
 fs/cifs/sess.c        | 29 +++++++++++++++++-----------
 4 files changed, 55 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index ef95a272f73d..f856732161ab 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -328,15 +328,15 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 	 * two times the unicode length of a server name +
 	 * size of a timestamp (which is 8 bytes).
 	 */
-	ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
-	ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL);
-	if (!ses->tiblob) {
-		ses->tilen = 0;
+	ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		ses->auth_key.len = 0;
 		cERROR(1, "Challenge target info allocation failure");
 		return -ENOMEM;
 	}
 
-	blobptr = ses->tiblob;
+	blobptr = ses->auth_key.response;
 	attrptr = (struct ntlmssp2_name *) blobptr;
 
 	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
@@ -400,11 +400,11 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 	unsigned char *blobend;
 	struct ntlmssp2_name *attrptr;
 
-	if (!ses->tilen || !ses->tiblob)
+	if (!ses->auth_key.len || !ses->auth_key.response)
 		return 0;
 
-	blobptr = ses->tiblob;
-	blobend = ses->tiblob + ses->tilen;
+	blobptr = ses->auth_key.response;
+	blobend = blobptr + ses->auth_key.len;
 
 	while (blobptr + onesize < blobend) {
 		attrptr = (struct ntlmssp2_name *) blobptr;
@@ -436,7 +436,7 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 	return 0;
 }
 
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
 			    const struct nls_table *nls_cp)
 {
 	int rc = 0;
@@ -509,7 +509,7 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 	}
 
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
-					ses->ntlmv2_hash);
+					ntlmv2_hash);
 
 calc_exit_1:
 	kfree(user);
@@ -518,7 +518,7 @@ calc_exit_2:
 }
 
 static int
-CalcNTLMv2_response(const struct cifsSesInfo *ses)
+CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
 {
 	int rc;
 	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
@@ -529,7 +529,7 @@ CalcNTLMv2_response(const struct cifsSesInfo *ses)
 	}
 
 	crypto_shash_setkey(ses->server->secmech.hmacmd5,
-				ses->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+				ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 
 	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
 	if (rc) {
@@ -539,7 +539,7 @@ CalcNTLMv2_response(const struct cifsSesInfo *ses)
 
 	if (ses->server->secType == RawNTLMSSP)
 		memcpy(ses->auth_key.response + offset,
-			ses->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+			ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	else
 		memcpy(ses->auth_key.response + offset,
 			ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
@@ -558,7 +558,10 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	int rc;
 	int baselen;
+	unsigned int tilen;
 	struct ntlmv2_resp *buf;
+	char ntlmv2_hash[16];
+	unsigned char *tiblob = NULL; /* target info blob */
 
 	if (ses->server->secType == RawNTLMSSP) {
 		if (!ses->domainName) {
@@ -572,18 +575,22 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 		rc = build_avpair_blob(ses, nls_cp);
 		if (rc) {
 			cERROR(1, "error %d building av pair blob", rc);
-			return rc;
+			goto setup_ntlmv2_rsp_ret;
 		}
 	}
 
 	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
-	ses->auth_key.len = baselen + ses->tilen;
-	ses->auth_key.response = kmalloc(ses->auth_key.len, GFP_KERNEL);
+	tilen = ses->auth_key.len;
+	tiblob = ses->auth_key.response;
+
+	ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
 	if (!ses->auth_key.response) {
 		rc = ENOMEM;
+		ses->auth_key.len = 0;
 		cERROR(1, "%s: Can't allocate auth blob", __func__);
 		goto setup_ntlmv2_rsp_ret;
 	}
+	ses->auth_key.len += baselen;
 
 	buf = (struct ntlmv2_resp *)
 			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
@@ -593,17 +600,17 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
 
-	memcpy(ses->auth_key.response + baselen, ses->tiblob, ses->tilen);
+	memcpy(ses->auth_key.response + baselen, tiblob, tilen);
 
 	/* calculate ntlmv2_hash */
-	rc = calc_ntlmv2_hash(ses, nls_cp);
+	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
 	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
 	/* calculate first part of the client response (CR1) */
-	rc = CalcNTLMv2_response(ses);
+	rc = CalcNTLMv2_response(ses, ntlmv2_hash);
 	if (rc) {
 		cERROR(1, "Could not calculate CR1  rc: %d", rc);
 		goto setup_ntlmv2_rsp_ret;
@@ -611,7 +618,7 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 
 	/* now calculate the session key for NTLMv2 */
 	crypto_shash_setkey(ses->server->secmech.hmacmd5,
-		ses->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+		ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 
 	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
 	if (rc) {
@@ -627,9 +634,7 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 		ses->auth_key.response);
 
 setup_ntlmv2_rsp_ret:
-	kfree(ses->tiblob);
-	ses->tiblob = NULL;
-	ses->tilen = 0;
+	kfree(tiblob);
 
 	return rc;
 }
@@ -657,7 +662,7 @@ calc_seckey(struct cifsSesInfo *ses)
 					CIFS_SESS_KEY_SIZE);
 
 	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
-	sg_init_one(&sgout, ses->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+	sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 
 	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
 	if (rc) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b73695176467..f259e4d7612d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -119,11 +119,12 @@ struct cifs_secmech {
 	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
 };
 
-/* per smb connection structure/fields */
+/* per smb session structure/fields */
 struct ntlmssp_auth {
 	__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
 	__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
 	unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 };
 
 struct cifs_cred {
@@ -241,12 +242,8 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
-	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 	struct session_key auth_key;
-	char ntlmv2_hash[16];
-	unsigned int tilen; /* length of the target info blob */
-	unsigned char *tiblob; /* target info blob in challenge response */
-	struct ntlmssp_auth ntlmssp; /* ciphertext, flags */
+	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4d8004ce5834..9eb327defa1d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1818,8 +1818,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 	if (ses == NULL)
 		goto get_ses_fail;
 
-	ses->tilen = 0;
-	ses->tiblob = NULL;
 	/* new SMB session uses our server ref */
 	ses->server = server;
 	if (server->addr.sockAddr6.sin6_family == AF_INET6)
@@ -1840,10 +1838,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 			goto get_ses_fail;
 	}
 	if (volume_info->domainname) {
-		int len = strlen(volume_info->domainname);
-		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
-		if (ses->domainName)
-			strcpy(ses->domainName, volume_info->domainname);
+		ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
+		if (!ses->domainName)
+			goto get_ses_fail;
 	}
 	ses->cred_uid = volume_info->cred_uid;
 	ses->linux_uid = volume_info->linux_uid;
@@ -3213,6 +3210,8 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	kfree(ses->auth_key.response);
 	ses->auth_key.response = NULL;
 	ses->auth_key.len = 0;
+	kfree(ses->ntlmssp);
+	ses->ntlmssp = NULL;
 
 	return rc;
 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index f74c5a88dd4c..7b01d3f6eed6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -399,23 +399,22 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 		return -EINVAL;
 	}
 
-	memcpy(ses->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
 	/* BB we could decode pblob->NegotiateFlags; some may be useful */
 	/* In particular we can examine sign flags */
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
-	ses->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
+	ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
 	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
 	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
-	ses->tilen = tilen;
-	if (ses->tilen) {
-		ses->tiblob = kmalloc(tilen, GFP_KERNEL);
-		if (!ses->tiblob) {
+	if (tilen) {
+		ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+		if (!ses->auth_key.response) {
 			cERROR(1, "Challenge target info allocation failure");
-			ses->tilen = 0;
 			return -ENOMEM;
 		}
-		memcpy(ses->tiblob,  bcc_ptr + tioffset, ses->tilen);
+		memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
+		ses->auth_key.len = tilen;
 	}
 
 	return 0;
@@ -545,9 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	if ((ses->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+	if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
 			!calc_seckey(ses)) {
-		memcpy(tmp, ses->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+		memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
 		sec_blob->SessionKey.MaximumLength =
@@ -601,8 +600,16 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 		return -EINVAL;
 
 	type = ses->server->secType;
-
 	cFYI(1, "sess setup type %d", type);
+	if (type == RawNTLMSSP) {
+		/* if memory allocation is successful, caller of this function
+		 * frees it.
+		 */
+		ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+		if (!ses->ntlmssp)
+			return -ENOMEM;
+	}
+
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
-- 
cgit v1.2.3


From a4118ee1d80b527c385cadd14db79559efb8a493 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 27 Oct 2010 11:00:08 -0400
Subject: a couple of open-coded ihold() introduced by nfs merge

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/unlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 9a16bad5d2ea..7bdec8531400 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -444,9 +444,9 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
 
 	/* set up nfs_renamedata */
 	data->old_dir = old_dir;
-	atomic_inc(&old_dir->i_count);
+	ihold(old_dir);
 	data->new_dir = new_dir;
-	atomic_inc(&new_dir->i_count);
+	ihold(new_dir);
 	data->old_dentry = dget(old_dentry);
 	data->new_dentry = dget(new_dentry);
 	nfs_fattr_init(&data->old_fattr);
-- 
cgit v1.2.3


From d893f1bc2a9f0f7dcb4b433452c59f9bedac0d7d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 29 Oct 2010 03:30:42 -0400
Subject: fix open/umount race

nameidata_to_filp() drops nd->path or transfers it to opened
file.  In the former case it's a Bad Idea(tm) to do mnt_drop_write()
on nd->path.mnt, since we might race with umount and vfsmount in
question might be gone already.

Fix: don't drop it, then...  IOW, have nameidata_to_filp() grab nd->path
in case it transfers it to file and do path_drop() in callers.  After
they are through with accessing nd->path...

Reported-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 2 ++
 fs/open.c  | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f7dbc06857ab..5362af9b7372 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1574,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
 	 */
 	if (will_truncate)
 		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit:
@@ -1675,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
+		path_put(&nd->path);
 		if (!IS_ERR(filp)) {
 			error = ima_file_check(filp, acc_mode);
 			if (error) {
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..4197b9ed023d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
 	/* Pick up the filp from the open intent */
 	filp = nd->intent.open.file;
 	/* Has the filesystem initialised the file for us? */
-	if (filp->f_path.dentry == NULL)
+	if (filp->f_path.dentry == NULL) {
+		path_get(&nd->path);
 		filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
 				     NULL, cred);
-	else
-		path_put(&nd->path);
+	}
 	return filp;
 }
 
-- 
cgit v1.2.3


From c96e41e92b4aaf11e1f9775ecf0d1c8cbff829ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 00:17:56 +0400
Subject: beginning of transtion: ->mount()

eventual replacement for ->get_sb() - does *not* get vfsmount,
return ERR_PTR(error) or root of subtree to be mounted.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 17 ++++++++++++++---
 include/linux/fs.h |  2 ++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index b9c9869165db..00a2c9662b55 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -918,6 +918,7 @@ struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
 	struct vfsmount *mnt;
+	struct dentry *root;
 	char *secdata = NULL;
 	int error;
 
@@ -942,9 +943,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 			goto out_free_secdata;
 	}
 
-	error = type->get_sb(type, flags, name, data, mnt);
-	if (error < 0)
-		goto out_free_secdata;
+	if (type->mount) {
+		root = type->mount(type, flags, name, data);
+		if (IS_ERR(root)) {
+			error = PTR_ERR(root);
+			goto out_free_secdata;
+		}
+		mnt->mnt_root = root;
+		mnt->mnt_sb = root->d_sb;
+	} else {
+		error = type->get_sb(type, flags, name, data, mnt);
+		if (error < 0)
+			goto out_free_secdata;
+	}
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
 	mnt->mnt_sb->s_flags |= MS_BORN;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1c73b50e81ff..c6b474311690 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1772,6 +1772,8 @@ struct file_system_type {
 	int fs_flags;
 	int (*get_sb) (struct file_system_type *, int,
 		       const char *, void *, struct vfsmount *);
+	struct dentry *(*mount) (struct file_system_type *, int,
+		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
 	struct file_system_type * next;
-- 
cgit v1.2.3


From 152a08366671080f27b32e0c411ad620c5f88b57 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 00:46:55 +0400
Subject: new helper: mount_bdev()

... and switch of the obvious get_sb_bdev() users to ->mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/super.c              |  9 ++++-----
 fs/affs/super.c              |  9 ++++-----
 fs/befs/linuxvfs.c           | 11 +++++------
 fs/bfs/inode.c               |  8 ++++----
 fs/cramfs/inode.c            |  9 ++++-----
 fs/efs/super.c               |  8 ++++----
 fs/ext2/super.c              |  8 ++++----
 fs/ext3/super.c              |  8 ++++----
 fs/ext4/super.c              | 16 ++++++++--------
 fs/fat/namei_msdos.c         |  9 ++++-----
 fs/fat/namei_vfat.c          |  9 ++++-----
 fs/freevxfs/vxfs_super.c     |  9 ++++-----
 fs/fuse/inode.c              |  9 ++++-----
 fs/hfs/super.c               |  9 ++++-----
 fs/hfsplus/super.c           | 10 ++++------
 fs/hpfs/super.c              |  9 ++++-----
 fs/isofs/inode.c             |  9 ++++-----
 fs/jfs/super.c               |  9 ++++-----
 fs/minix/inode.c             |  9 ++++-----
 fs/ntfs/super.c              |  9 ++++-----
 fs/ocfs2/super.c             | 11 ++++-------
 fs/omfs/inode.c              |  9 ++++-----
 fs/qnx4/inode.c              |  9 ++++-----
 fs/reiserfs/super.c          |  9 ++++-----
 fs/squashfs/super.c          | 10 ++++------
 fs/super.c                   | 28 +++++++++++++++++++++-------
 fs/sysv/super.c              | 17 ++++++++---------
 fs/udf/super.c               |  9 ++++-----
 fs/ufs/super.c               |  8 ++++----
 fs/xfs/linux-2.6/xfs_super.c | 12 +++++-------
 include/linux/fs.h           |  3 +++
 31 files changed, 150 insertions(+), 161 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index d9803f73236f..959dbff2d42d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -490,17 +490,16 @@ error:
 	return -EINVAL;
 }
 
-static int adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *adfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
 }
 
 static struct file_system_type adfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "adfs",
-	.get_sb		= adfs_get_sb,
+	.mount		= adfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/affs/super.c b/fs/affs/super.c
index fa4fbe1e238a..0cf7f4384cbd 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -573,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *affs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
 }
 
 static struct file_system_type affs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "affs",
-	.get_sb		= affs_get_sb,
+	.mount		= affs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..aa4e7c7ae3c6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int
-befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data, struct vfsmount *mnt)
+static struct dentry *
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
+	    void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
 }
 
 static struct file_system_type befs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "befs",
-	.get_sb		= befs_get_sb,
+	.mount		= befs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,	
 };
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 883e77acd5a8..76db6d7d49bb 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -450,16 +450,16 @@ out:
 	return ret;
 }
 
-static int bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
 }
 
 static struct file_system_type bfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bfs",
-	.get_sb		= bfs_get_sb,
+	.mount		= bfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..32fd5fe9ca0e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static int cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
 }
 
 static struct file_system_type cramfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "cramfs",
-	.get_sb		= cramfs_get_sb,
+	.mount		= cramfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..5073a07652cc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static int efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *efs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
 }
 
 static struct file_system_type efs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "efs",
-	.get_sb		= efs_get_sb,
+	.mount		= efs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0901320671da..d89e0b6a2d78 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1356,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	return 0;
 }
 
-static int ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext2_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
 }
 
 #ifdef CONFIG_QUOTA
@@ -1473,7 +1473,7 @@ out:
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
-	.get_sb		= ext2_get_sb,
+	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index db87413d3479..2fedaf8b5012 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -3020,16 +3020,16 @@ out:
 
 #endif
 
-static int ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
 }
 
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
-	.get_sb		= ext3_get_sb,
+	.mount		= ext3_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0348ce066592..40131b777af6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -73,8 +73,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt);
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 
@@ -82,7 +82,7 @@ static void ext4_unregister_li_request(struct super_block *sb);
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -4667,17 +4667,17 @@ out:
 
 #endif
 
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -4722,7 +4722,7 @@ static inline void unregister_as_ext3(void) { }
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbca5c186ae7..3345aabd1dd7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -675,18 +675,17 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int msdos_get_sb(struct file_system_type *fs_type,
+static struct dentry *msdos_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
 }
 
 static struct file_system_type msdos_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "msdos",
-	.get_sb		= msdos_get_sb,
+	.mount		= msdos_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6f0f6c9a0152..b936703b8924 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1071,18 +1071,17 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int vfat_get_sb(struct file_system_type *fs_type,
+static struct dentry *vfat_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
-		       void *data, struct vfsmount *mnt)
+		       void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
 }
 
 static struct file_system_type vfat_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "vfat",
-	.get_sb		= vfat_get_sb,
+	.mount		= vfat_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 71b0148b8784..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -246,17 +246,16 @@ out:
 /*
  * The usual module blurb.
  */
-static int vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *vxfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
 }
 
 static struct file_system_type vxfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "vxfs",
-	.get_sb		= vxfs_get_sb,
+	.mount		= vxfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..edf6a1843533 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1070,12 +1070,11 @@ static struct file_system_type fuse_fs_type = {
 };
 
 #ifdef CONFIG_BLOCK
-static int fuse_get_sb_blk(struct file_system_type *fs_type,
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *raw_data, struct vfsmount *mnt)
+			   void *raw_data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
 }
 
 static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
-	.get_sb		= fuse_get_sb_blk,
+	.mount		= fuse_mount_blk,
 	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 6ee1586f2334..4824c27cebb8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -441,17 +441,16 @@ bail:
 	return res;
 }
 
-static int hfs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 
 static struct file_system_type hfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfs",
-	.get_sb		= hfs_get_sb,
+	.mount		= hfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a88d7536103..52cc746d3ba3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode)
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static int hfsplus_get_sb(struct file_system_type *fs_type,
-			  int flags, const char *dev_name, void *data,
-			  struct vfsmount *mnt)
+static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
 }
 
 static struct file_system_type hfsplus_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfsplus",
-	.get_sb		= hfsplus_get_sb,
+	.mount		= hfsplus_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index c969a1aa163a..bb69389972eb 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -686,17 +686,16 @@ bail0:
 	return -EINVAL;
 }
 
-static int hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *hpfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
 }
 
 static struct file_system_type hpfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hpfs",
-	.get_sb		= hpfs_get_sb,
+	.mount		= hpfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 79cf7f616bbe..bfdeb82a53be 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1507,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static int isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *isofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
-				mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
 }
 
 static struct file_system_type iso9660_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "iso9660",
-	.get_sb		= isofs_get_sb,
+	.mount		= isofs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 68eee2bf629e..0669fc1cc3bf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -583,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
 	return 0;
 }
 
-static int jfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -770,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
 static struct file_system_type jfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "jfs",
-	.get_sb		= jfs_get_sb,
+	.mount		= jfs_do_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..fb2020858a34 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static int minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *minix_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
 }
 
 static struct file_system_type minix_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "minix",
-	.get_sb		= minix_get_sb,
+	.mount		= minix_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d3fbe5730bfc..a30ecacc01f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3059,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static int ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
 }
 
 static struct file_system_type ntfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ntfs",
-	.get_sb		= ntfs_get_sb,
+	.mount		= ntfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f0cb395820..f02c0ef31578 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1236,14 +1236,12 @@ read_super_error:
 	return status;
 }
 
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
 			int flags,
 			const char *dev_name,
-			void *data,
-			struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
 }
 
 static void ocfs2_kill_sb(struct super_block *sb)
@@ -1267,8 +1265,7 @@ out:
 static struct file_system_type ocfs2_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ocfs2",
-	.get_sb         = ocfs2_get_sb, /* is this called when we mount
-					* the fs? */
+	.mount          = ocfs2_mount,
 	.kill_sb        = ocfs2_kill_sb,
 
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
 	return ret;
 }
 
-static int omfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *m)
+static struct dentry *omfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+	return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
 }
 
 static struct file_system_type omfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "omfs",
-	.get_sb = omfs_get_sb,
+	.mount = omfs_mount,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 01bad30026fc..fcada42f1aa3 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -454,17 +454,16 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static int qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
 }
 
 static struct file_system_type qnx4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "qnx4",
-	.get_sb		= qnx4_get_sb,
+	.mount		= qnx4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..3bf7a6457f4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2213,12 +2213,11 @@ out:
 
 #endif
 
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *data, struct vfsmount *mnt)
+			   void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
 }
 
 static int __init init_reiserfs_fs(void)
@@ -2253,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "reiserfs",
-	.get_sb = get_super_block,
+	.mount = get_super_block,
 	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 07a4f1156048..24de30ba34c1 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -370,12 +370,10 @@ static void squashfs_put_super(struct super_block *sb)
 }
 
 
-static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
-				const char *dev_name, void *data,
-				struct vfsmount *mnt)
+static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
-				mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
 }
 
 
@@ -451,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
 static struct file_system_type squashfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "squashfs",
-	.get_sb = squashfs_get_sb,
+	.mount = squashfs_mount,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV
 };
diff --git a/fs/super.c b/fs/super.c
index 00a2c9662b55..40989e9a2606 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -762,10 +762,9 @@ static int test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
-int get_sb_bdev(struct file_system_type *fs_type,
+struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -777,7 +776,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+		return ERR_CAST(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -829,15 +828,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
 	close_bdev_exclusive(bdev, mode);
 error:
-	return error;
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+
+int get_sb_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+
+	root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..3d9c62be0c10 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -526,23 +526,22 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static int sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *sysv_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
 }
 
-static int v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *v7_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
 }
 
 static struct file_system_type sysv_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "sysv",
-	.get_sb		= sysv_get_sb,
+	.mount		= sysv_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
 static struct file_system_type v7_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "v7",
-	.get_sb		= v7_get_sb,
+	.mount		= v7_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 76f3d6d97b40..4a5c7c61836a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 }
 
 /* UDF filesystem type */
-static int udf_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *udf_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
 }
 
 static struct file_system_type udf_fstype = {
 	.owner		= THIS_MODULE,
 	.name		= "udf",
-	.get_sb		= udf_get_sb,
+	.mount		= udf_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6b9be90dae7d..2c47daed56da 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1454,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
 	.show_options   = ufs_show_options,
 };
 
-static int ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ufs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
 }
 
 static struct file_system_type ufs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ufs",
-	.get_sb		= ufs_get_sb,
+	.mount		= ufs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index cf808782c065..9f3a78fe6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1609,16 +1609,14 @@ xfs_fs_fill_super(
 	goto out_free_sb;
 }
 
-STATIC int
-xfs_fs_get_sb(
+STATIC struct dentry *
+xfs_fs_mount(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data,
-	struct vfsmount		*mnt)
+	void			*data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
 static const struct super_operations xfs_super_operations = {
@@ -1639,7 +1637,7 @@ static const struct super_operations xfs_super_operations = {
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
-	.get_sb			= xfs_fs_get_sb,
+	.mount			= xfs_fs_mount,
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6b474311690..2fab5a24ca51 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1792,6 +1792,9 @@ struct file_system_type {
 extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt);
+extern struct dentry *mount_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int));
 extern int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
-- 
cgit v1.2.3


From 848b83a59b772b8f102bc5e3f1187c2fa5676959 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 00:56:46 +0400
Subject: convert get_sb_mtd() users to ->mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/mtd/mtdsuper.c    | 54 +++++++++++++++++++----------------------------
 fs/jffs2/super.c          |  9 ++++----
 fs/romfs/super.c          | 17 +++++++--------
 include/linux/mtd/super.h |  5 ++---
 4 files changed, 36 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 38e2ab07e7a3..16b02a1fc100 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -54,11 +54,10 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
 /*
  * get a superblock on an MTD-backed filesystem
  */
-static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags,
+static struct dentry *mount_mtd_aux(struct file_system_type *fs_type, int flags,
 			  const char *dev_name, void *data,
 			  struct mtd_info *mtd,
-			  int (*fill_super)(struct super_block *, void *, int),
-			  struct vfsmount *mnt)
+			  int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 	int ret;
@@ -79,57 +78,49 @@ static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags,
 	ret = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 	if (ret < 0) {
 		deactivate_locked_super(sb);
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	/* go */
 	sb->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, sb);
-
-	return 0;
+	return dget(sb->s_root);
 
 	/* new mountpoint for an already mounted superblock */
 already_mounted:
 	DEBUG(1, "MTDSB: Device %d (\"%s\") is already mounted\n",
 	      mtd->index, mtd->name);
-	simple_set_mnt(mnt, sb);
-	ret = 0;
-	goto out_put;
+	put_mtd_device(mtd);
+	return dget(sb->s_root);
 
 out_error:
-	ret = PTR_ERR(sb);
-out_put:
 	put_mtd_device(mtd);
-	return ret;
+	return ERR_CAST(sb);
 }
 
 /*
  * get a superblock on an MTD-backed filesystem by MTD device number
  */
-static int get_sb_mtd_nr(struct file_system_type *fs_type, int flags,
+static struct dentry *mount_mtd_nr(struct file_system_type *fs_type, int flags,
 			 const char *dev_name, void *data, int mtdnr,
-			 int (*fill_super)(struct super_block *, void *, int),
-			 struct vfsmount *mnt)
+			 int (*fill_super)(struct super_block *, void *, int))
 {
 	struct mtd_info *mtd;
 
 	mtd = get_mtd_device(NULL, mtdnr);
 	if (IS_ERR(mtd)) {
 		DEBUG(0, "MTDSB: Device #%u doesn't appear to exist\n", mtdnr);
-		return PTR_ERR(mtd);
+		return ERR_CAST(mtd);
 	}
 
-	return get_sb_mtd_aux(fs_type, flags, dev_name, data, mtd, fill_super,
-			      mnt);
+	return mount_mtd_aux(fs_type, flags, dev_name, data, mtd, fill_super);
 }
 
 /*
  * set up an MTD-based superblock
  */
-int get_sb_mtd(struct file_system_type *fs_type, int flags,
+struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
 	       const char *dev_name, void *data,
-	       int (*fill_super)(struct super_block *, void *, int),
-	       struct vfsmount *mnt)
+	       int (*fill_super)(struct super_block *, void *, int))
 {
 #ifdef CONFIG_BLOCK
 	struct block_device *bdev;
@@ -138,7 +129,7 @@ int get_sb_mtd(struct file_system_type *fs_type, int flags,
 	int mtdnr;
 
 	if (!dev_name)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	DEBUG(2, "MTDSB: dev_name \"%s\"\n", dev_name);
 
@@ -156,10 +147,10 @@ int get_sb_mtd(struct file_system_type *fs_type, int flags,
 
 			mtd = get_mtd_device_nm(dev_name + 4);
 			if (!IS_ERR(mtd))
-				return get_sb_mtd_aux(
+				return mount_mtd_aux(
 					fs_type, flags,
 					dev_name, data, mtd,
-					fill_super, mnt);
+					fill_super);
 
 			printk(KERN_NOTICE "MTD:"
 			       " MTD device with name \"%s\" not found.\n",
@@ -174,9 +165,9 @@ int get_sb_mtd(struct file_system_type *fs_type, int flags,
 				/* It was a valid number */
 				DEBUG(1, "MTDSB: mtd%%d, mtdnr %d\n",
 				      mtdnr);
-				return get_sb_mtd_nr(fs_type, flags,
+				return mount_mtd_nr(fs_type, flags,
 						     dev_name, data,
-						     mtdnr, fill_super, mnt);
+						     mtdnr, fill_super);
 			}
 		}
 	}
@@ -189,7 +180,7 @@ int get_sb_mtd(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
 		DEBUG(1, "MTDSB: lookup_bdev() returned %d\n", ret);
-		return ret;
+		return ERR_PTR(ret);
 	}
 	DEBUG(1, "MTDSB: lookup_bdev() returned 0\n");
 
@@ -202,8 +193,7 @@ int get_sb_mtd(struct file_system_type *fs_type, int flags,
 	if (major != MTD_BLOCK_MAJOR)
 		goto not_an_MTD_device;
 
-	return get_sb_mtd_nr(fs_type, flags, dev_name, data, mtdnr, fill_super,
-			     mnt);
+	return mount_mtd_nr(fs_type, flags, dev_name, data, mtdnr, fill_super);
 
 not_an_MTD_device:
 #endif /* CONFIG_BLOCK */
@@ -212,10 +202,10 @@ not_an_MTD_device:
 		printk(KERN_NOTICE
 		       "MTD: Attempt to mount non-MTD device \"%s\"\n",
 		       dev_name);
-	return -EINVAL;
+	return ERR_PTR(-EINVAL);
 }
 
-EXPORT_SYMBOL_GPL(get_sb_mtd);
+EXPORT_SYMBOL_GPL(mount_mtd);
 
 /*
  * destroy an MTD-based superblock
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d1ae5dfc22b9..c86041b866a4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -179,12 +179,11 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 	return ret;
 }
 
-static int jffs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *jffs2_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super,
-			  mnt);
+	return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
 }
 
 static void jffs2_put_super (struct super_block *sb)
@@ -229,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
 static struct file_system_type jffs2_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"jffs2",
-	.get_sb =	jffs2_get_sb,
+	.mount =	jffs2_mount,
 	.kill_sb =	jffs2_kill_sb,
 };
 
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268580535c92..6647f90e55cd 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -552,20 +552,19 @@ error_rsb:
 /*
  * get a superblock for mounting
  */
-static int romfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	int ret = -EINVAL;
+	struct dentry *ret = ERR_PTR(-EINVAL);
 
 #ifdef CONFIG_ROMFS_ON_MTD
-	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
-			 mnt);
+	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-	if (ret == -EINVAL)
-		ret = get_sb_bdev(fs_type, flags, dev_name, data,
-				  romfs_fill_super, mnt);
+	if (ret == ERR_PTR(-EINVAL))
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				  romfs_fill_super);
 #endif
 	return ret;
 }
@@ -592,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
+	.mount		= romfs_mount,
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/include/linux/mtd/super.h b/include/linux/mtd/super.h
index 4016dd6fe336..f456230f9330 100644
--- a/include/linux/mtd/super.h
+++ b/include/linux/mtd/super.h
@@ -18,10 +18,9 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 
-extern int get_sb_mtd(struct file_system_type *fs_type, int flags,
+extern struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
 		      const char *dev_name, void *data,
-		      int (*fill_super)(struct super_block *, void *, int),
-		      struct vfsmount *mnt);
+		      int (*fill_super)(struct super_block *, void *, int));
 extern void kill_mtd_super(struct super_block *sb);
 
 
-- 
cgit v1.2.3


From fc14f2fef682df677d64a145256dbd263df2aa7b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 01:48:30 +0400
Subject: convert get_sb_single() users

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/inode.c | 10 +++++-----
 arch/s390/hypfs/inode.c                   |  8 ++++----
 drivers/base/devtmpfs.c                   | 18 ++++++++---------
 drivers/infiniband/hw/ipath/ipath_fs.c    | 14 +++++++-------
 drivers/infiniband/hw/qib/qib_fs.c        | 14 +++++++-------
 drivers/isdn/capi/capifs.c                |  8 ++++----
 drivers/misc/ibmasm/ibmasmfs.c            |  9 ++++-----
 drivers/oprofile/oprofilefs.c             |  8 ++++----
 drivers/usb/core/inode.c                  |  8 ++++----
 drivers/usb/gadget/f_fs.c                 | 14 +++++++-------
 drivers/usb/gadget/inode.c                | 10 +++++-----
 drivers/xen/xenfs/super.c                 |  8 ++++----
 fs/binfmt_misc.c                          |  8 ++++----
 fs/configfs/mount.c                       |  8 ++++----
 fs/debugfs/inode.c                        |  8 ++++----
 fs/devpts/inode.c                         | 32 +++++++++++++++----------------
 fs/fuse/control.c                         | 10 ++++------
 fs/nfsd/nfsctl.c                          |  8 ++++----
 fs/openpromfs/inode.c                     |  8 ++++----
 fs/super.c                                | 25 ++++++++++++++++++------
 include/linux/fs.h                        |  3 +++
 net/sunrpc/rpc_pipe.c                     | 18 ++++++++---------
 security/inode.c                          |  8 ++++----
 security/selinux/selinuxfs.c              |  9 ++++-----
 security/smack/smackfs.c                  | 12 +++++-------
 25 files changed, 147 insertions(+), 139 deletions(-)

(limited to 'fs')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 5dec408d6703..3532b92de983 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -798,17 +798,17 @@ spufs_fill_super(struct super_block *sb, void *data, int silent)
 	return spufs_create_root(sb, data);
 }
 
-static int
-spufs_get_sb(struct file_system_type *fstype, int flags,
-		const char *name, void *data, struct vfsmount *mnt)
+static struct dentry *
+spufs_mount(struct file_system_type *fstype, int flags,
+		const char *name, void *data)
 {
-	return get_sb_single(fstype, flags, data, spufs_fill_super, mnt);
+	return mount_single(fstype, flags, data, spufs_fill_super);
 }
 
 static struct file_system_type spufs_type = {
 	.owner = THIS_MODULE,
 	.name = "spufs",
-	.get_sb = spufs_get_sb,
+	.mount = spufs_mount,
 	.kill_sb = kill_litter_super,
 };
 
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 74d98670be27..47cc446dab8f 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -316,10 +316,10 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int hypfs_get_super(struct file_system_type *fst, int flags,
-			const char *devname, void *data, struct vfsmount *mnt)
+static struct dentry *hypfs_mount(struct file_system_type *fst, int flags,
+			const char *devname, void *data)
 {
-	return get_sb_single(fst, flags, data, hypfs_fill_super, mnt);
+	return mount_single(fst, flags, data, hypfs_fill_super);
 }
 
 static void hypfs_kill_super(struct super_block *sb)
@@ -455,7 +455,7 @@ static const struct file_operations hypfs_file_ops = {
 static struct file_system_type hypfs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "s390_hypfs",
-	.get_sb		= hypfs_get_super,
+	.mount		= hypfs_mount,
 	.kill_sb	= hypfs_kill_super
 };
 
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index af0600143d1c..82bbb5967aa9 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -29,33 +29,33 @@
 static struct vfsmount *dev_mnt;
 
 #if defined CONFIG_DEVTMPFS_MOUNT
-static int dev_mount = 1;
+static int mount_dev = 1;
 #else
-static int dev_mount;
+static int mount_dev;
 #endif
 
 static DEFINE_MUTEX(dirlock);
 
 static int __init mount_param(char *str)
 {
-	dev_mount = simple_strtoul(str, NULL, 0);
+	mount_dev = simple_strtoul(str, NULL, 0);
 	return 1;
 }
 __setup("devtmpfs.mount=", mount_param);
 
-static int dev_get_sb(struct file_system_type *fs_type, int flags,
-		      const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dev_mount(struct file_system_type *fs_type, int flags,
+		      const char *dev_name, void *data)
 {
 #ifdef CONFIG_TMPFS
-	return get_sb_single(fs_type, flags, data, shmem_fill_super, mnt);
+	return mount_single(fs_type, flags, data, shmem_fill_super);
 #else
-	return get_sb_single(fs_type, flags, data, ramfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, ramfs_fill_super);
 #endif
 }
 
 static struct file_system_type dev_fs_type = {
 	.name = "devtmpfs",
-	.get_sb = dev_get_sb,
+	.mount = dev_mount,
 	.kill_sb = kill_litter_super,
 };
 
@@ -351,7 +351,7 @@ int devtmpfs_mount(const char *mntdir)
 {
 	int err;
 
-	if (!dev_mount)
+	if (!mount_dev)
 		return 0;
 
 	if (!dev_mnt)
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 12d5bf76302c..8c8afc716b98 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -362,13 +362,13 @@ bail:
 	return ret;
 }
 
-static int ipathfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
 {
-	int ret = get_sb_single(fs_type, flags, data,
-				    ipathfs_fill_super, mnt);
-	if (ret >= 0)
-		ipath_super = mnt->mnt_sb;
+	struct dentry *ret;
+	ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+	if (!IS_ERR(ret))
+		ipath_super = ret->d_sb;
 	return ret;
 }
 
@@ -411,7 +411,7 @@ bail:
 static struct file_system_type ipathfs_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"ipathfs",
-	.get_sb =	ipathfs_get_sb,
+	.mount =	ipathfs_mount,
 	.kill_sb =	ipathfs_kill_super,
 };
 
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 7e433d75c775..f99bddc01716 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -555,13 +555,13 @@ bail:
 	return ret;
 }
 
-static int qibfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data)
 {
-	int ret = get_sb_single(fs_type, flags, data,
-				qibfs_fill_super, mnt);
-	if (ret >= 0)
-		qib_super = mnt->mnt_sb;
+	struct dentry *ret;
+	ret = mount_single(fs_type, flags, data, qibfs_fill_super);
+	if (!IS_ERR(ret))
+		qib_super = ret->d_sb;
 	return ret;
 }
 
@@ -603,7 +603,7 @@ int qibfs_remove(struct qib_devdata *dd)
 static struct file_system_type qibfs_fs_type = {
 	.owner =        THIS_MODULE,
 	.name =         "ipathfs",
-	.get_sb =       qibfs_get_sb,
+	.mount =        qibfs_mount,
 	.kill_sb =      qibfs_kill_super,
 };
 
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 2b83850997c3..b4faed7fe0d3 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -125,16 +125,16 @@ fail:
 	return -ENOMEM;
 }
 
-static int capifs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *capifs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, capifs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, capifs_fill_super);
 }
 
 static struct file_system_type capifs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "capifs",
-	.get_sb		= capifs_get_sb,
+	.mount		= capifs_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 0a53500636c9..d2d5d23416dd 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -91,11 +91,10 @@ static void ibmasmfs_create_files (struct super_block *sb, struct dentry *root);
 static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent);
 
 
-static int ibmasmfs_get_super(struct file_system_type *fst,
-			int flags, const char *name, void *data,
-			struct vfsmount *mnt)
+static struct dentry *ibmasmfs_mount(struct file_system_type *fst,
+			int flags, const char *name, void *data)
 {
-	return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt);
+	return mount_single(fst, flags, data, ibmasmfs_fill_super);
 }
 
 static const struct super_operations ibmasmfs_s_ops = {
@@ -108,7 +107,7 @@ static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations;
 static struct file_system_type ibmasmfs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ibmasmfs",
-	.get_sb         = ibmasmfs_get_super,
+	.mount          = ibmasmfs_mount,
 	.kill_sb        = kill_litter_super,
 };
 
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 449de59bf35b..e9ff6f7770be 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -259,17 +259,17 @@ static int oprofilefs_fill_super(struct super_block *sb, void *data, int silent)
 }
 
 
-static int oprofilefs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *oprofilefs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, oprofilefs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, oprofilefs_fill_super);
 }
 
 
 static struct file_system_type oprofilefs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "oprofilefs",
-	.get_sb		= oprofilefs_get_sb,
+	.mount		= oprofilefs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index e2f63c0ea09d..9819a4cc3b26 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -574,16 +574,16 @@ static void fs_remove_file (struct dentry *dentry)
 
 /* --------------------------------------------------------------------- */
 
-static int usb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *usb_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, usbfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, usbfs_fill_super);
 }
 
 static struct file_system_type usb_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"usbfs",
-	.get_sb =	usb_get_sb,
+	.mount =	usb_mount,
 	.kill_sb =	kill_litter_super,
 };
 
diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index f276e9594f00..4a830df4fc31 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -1176,9 +1176,9 @@ invalid:
 
 /* "mount -t functionfs dev_name /dev/function" ends up here */
 
-static int
-ffs_fs_get_sb(struct file_system_type *t, int flags,
-	      const char *dev_name, void *opts, struct vfsmount *mnt)
+static struct dentry *
+ffs_fs_mount(struct file_system_type *t, int flags,
+	      const char *dev_name, void *opts)
 {
 	struct ffs_sb_fill_data data = {
 		.perms = {
@@ -1194,14 +1194,14 @@ ffs_fs_get_sb(struct file_system_type *t, int flags,
 
 	ret = functionfs_check_dev_callback(dev_name);
 	if (unlikely(ret < 0))
-		return ret;
+		return ERR_PTR(ret);
 
 	ret = ffs_fs_parse_opts(&data, opts);
 	if (unlikely(ret < 0))
-		return ret;
+		return ERR_PTR(ret);
 
 	data.dev_name = dev_name;
-	return get_sb_single(t, flags, &data, ffs_sb_fill, mnt);
+	return mount_single(t, flags, &data, ffs_sb_fill);
 }
 
 static void
@@ -1220,7 +1220,7 @@ ffs_fs_kill_sb(struct super_block *sb)
 static struct file_system_type ffs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "functionfs",
-	.get_sb		= ffs_fs_get_sb,
+	.mount		= ffs_fs_mount,
 	.kill_sb	= ffs_fs_kill_sb,
 };
 
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index ba145e7fbe03..3ed73f49cf18 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -2097,11 +2097,11 @@ enomem0:
 }
 
 /* "mount -t gadgetfs path /dev/gadget" ends up here */
-static int
-gadgetfs_get_sb (struct file_system_type *t, int flags,
-		const char *path, void *opts, struct vfsmount *mnt)
+static struct dentry *
+gadgetfs_mount (struct file_system_type *t, int flags,
+		const char *path, void *opts)
 {
-	return get_sb_single (t, flags, opts, gadgetfs_fill_super, mnt);
+	return mount_single (t, flags, opts, gadgetfs_fill_super);
 }
 
 static void
@@ -2119,7 +2119,7 @@ gadgetfs_kill_sb (struct super_block *sb)
 static struct file_system_type gadgetfs_type = {
 	.owner		= THIS_MODULE,
 	.name		= shortname,
-	.get_sb		= gadgetfs_get_sb,
+	.mount		= gadgetfs_mount,
 	.kill_sb	= gadgetfs_kill_sb,
 };
 
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index d6662b789b6b..f6339d11d59c 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -121,17 +121,17 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 	return rc;
 }
 
-static int xenfs_get_sb(struct file_system_type *fs_type,
+static int xenfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, xenfs_fill_super);
 }
 
 static struct file_system_type xenfs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"xenfs",
-	.get_sb =	xenfs_get_sb,
+	.mount =	xenfs_mount,
 	.kill_sb =	kill_litter_super,
 };
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 29990f0eee0c..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -706,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static int bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bm_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
+	return mount_single(fs_type, flags, data, bm_fill_super);
 }
 
 static struct linux_binfmt misc_format = {
@@ -720,7 +720,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
-	.get_sb		= bm_get_sb,
+	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..7d3607febe1c 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, configfs_fill_super);
 }
 
 static struct file_system_type configfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "configfs",
-	.get_sb		= configfs_get_sb,
+	.mount		= configfs_do_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a4ed8380e98a..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -135,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static int debug_get_sb(struct file_system_type *fs_type,
+static struct dentry *debug_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
+	return mount_single(fs_type, flags, data, debug_fill_super);
 }
 
 static struct file_system_type debug_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"debugfs",
-	.get_sb =	debug_get_sb,
+	.mount =	debug_mount,
 	.kill_sb =	kill_litter_super,
 };
 
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 
 /*
- * devpts_get_sb()
+ * devpts_mount()
  *
  *     If the '-o newinstance' mount option was specified, mount a new
  *     (private) instance of devpts.  PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
  *     semantics in devpts while preserving backward compatibility of the
  *     current 'single-namespace' semantics. i.e all mounts of devpts
  *     without the 'newinstance' mount option should bind to the initial
- *     kernel mount, like get_sb_single().
+ *     kernel mount, like mount_single().
  *
  *     Mounts with 'newinstance' option create a new, private namespace.
  *
  *     NOTE:
  *
- *     For single-mount semantics, devpts cannot use get_sb_single(),
- *     because get_sb_single()/sget() find and use the super-block from
+ *     For single-mount semantics, devpts cannot use mount_single(),
+ *     because mount_single()/sget() find and use the super-block from
  *     the most recent mount of devpts. But that recent mount may be a
- *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     'newinstance' mount and mount_single() would pick the newinstance
  *     super-block instead of the initial super-block.
  */
-static int devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *devpts_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	int error;
 	struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 
 	error = parse_mount_options(data, PARSE_MOUNT, &opts);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 
 	if (opts.newinstance)
 		s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 		s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	if (!s->s_root) {
 		s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 	if (error)
 		goto out_undo_sget;
 
-	simple_set_mnt(mnt, s);
-
-	return 0;
+	return dget(s->s_root);
 
 out_undo_sget:
 	deactivate_locked_super(s);
-	return error;
+	return ERR_PTR(error);
 }
 
 #else
@@ -404,10 +402,10 @@ out_undo_sget:
  * This supports only the legacy single-instance semantics (no
  * multiple-instance semantics)
  */
-static int devpts_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+	return mount_single(fs_type, flags, data, devpts_fill_super);
 }
 #endif
 
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
 
 static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
-	.get_sb		= devpts_get_sb,
+	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
 };
 
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4eba07661e5c..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -322,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *raw_data,
-			struct vfsmount *mnt)
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *raw_data)
 {
-	return get_sb_single(fs_type, flags, raw_data,
-				fuse_ctl_fill_super, mnt);
+	return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
 }
 
 static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -346,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
 static struct file_system_type fuse_ctl_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fusectl",
-	.get_sb		= fuse_ctl_get_sb,
+	.mount		= fuse_ctl_mount,
 	.kill_sb	= fuse_ctl_kill_sb,
 };
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d6dc3f61f8ba..4514ebbee4d6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1405,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static int nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *nfsd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
+	return mount_single(fs_type, flags, data, nfsd_fill_super);
 }
 
 static struct file_system_type nfsd_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfsd",
-	.get_sb		= nfsd_get_sb,
+	.mount		= nfsd_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..ddb1f41376e5 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
 	return ret;
 }
 
-static int openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *openprom_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
+	return mount_single(fs_type, flags, data, openprom_fill_super)
 }
 
 static struct file_system_type openprom_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "openpromfs",
-	.get_sb		= openprom_get_sb,
+	.mount		= openprom_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/fs/super.c b/fs/super.c
index 40989e9a2606..6f021a171ac6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -900,29 +900,42 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-int get_sb_single(struct file_system_type *fs_type,
+struct dentry *mount_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(s);
-			return error;
+			return ERR_PTR(error);
 		}
 		s->s_flags |= MS_ACTIVE;
 	} else {
 		do_remount_sb(s, flags, data, 0);
 	}
-	simple_set_mnt(mnt, s);
+	return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+
+int get_sb_single(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+	root = mount_single(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
 	return 0;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2fab5a24ca51..0aa2f1202afa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1799,6 +1799,9 @@ extern int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt);
+extern struct dentry *mount_single(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int));
 extern int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 7df92d237cb8..10a17a37ec4e 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -28,7 +28,7 @@
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/cache.h>
 
-static struct vfsmount *rpc_mount __read_mostly;
+static struct vfsmount *rpc_mnt __read_mostly;
 static int rpc_mount_count;
 
 static struct file_system_type rpc_pipe_fs_type;
@@ -417,16 +417,16 @@ struct vfsmount *rpc_get_mount(void)
 {
 	int err;
 
-	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count);
+	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt, &rpc_mount_count);
 	if (err != 0)
 		return ERR_PTR(err);
-	return rpc_mount;
+	return rpc_mnt;
 }
 EXPORT_SYMBOL_GPL(rpc_get_mount);
 
 void rpc_put_mount(void)
 {
-	simple_release_fs(&rpc_mount, &rpc_mount_count);
+	simple_release_fs(&rpc_mnt, &rpc_mount_count);
 }
 EXPORT_SYMBOL_GPL(rpc_put_mount);
 
@@ -1018,17 +1018,17 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int
-rpc_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+rpc_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt);
+	return mount_single(fs_type, flags, data, rpc_fill_super);
 }
 
 static struct file_system_type rpc_pipe_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "rpc_pipefs",
-	.get_sb		= rpc_get_sb,
+	.mount		= rpc_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/security/inode.c b/security/inode.c
index cb8f47c66a58..c4df2fbebe6b 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -131,17 +131,17 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, SECURITYFS_MAGIC, files);
 }
 
-static int get_sb(struct file_system_type *fs_type,
+static struct dentry *get_sb(struct file_system_type *fs_type,
 		  int flags, const char *dev_name,
-		  void *data, struct vfsmount *mnt)
+		  void *data)
 {
-	return get_sb_single(fs_type, flags, data, fill_super, mnt);
+	return mount_single(fs_type, flags, data, fill_super);
 }
 
 static struct file_system_type fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"securityfs",
-	.get_sb =	get_sb,
+	.mount =	get_sb,
 	.kill_sb =	kill_litter_super,
 };
 
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 55a755c1a1bd..073fd5b0a53a 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1909,16 +1909,15 @@ err:
 	goto out;
 }
 
-static int sel_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *sel_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, sel_fill_super, mnt);
+	return mount_single(fs_type, flags, data, sel_fill_super);
 }
 
 static struct file_system_type sel_fs_type = {
 	.name		= "selinuxfs",
-	.get_sb		= sel_get_sb,
+	.mount		= sel_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 7512502d0162..dc1fd6239f24 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -1310,27 +1310,25 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
 }
 
 /**
- * smk_get_sb - get the smackfs superblock
+ * smk_mount - get the smackfs superblock
  * @fs_type: passed along without comment
  * @flags: passed along without comment
  * @dev_name: passed along without comment
  * @data: passed along without comment
- * @mnt: passed along without comment
  *
  * Just passes everything along.
  *
  * Returns what the lower level code does.
  */
-static int smk_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *smk_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, smk_fill_super, mnt);
+	return mount_single(fs_type, flags, data, smk_fill_super);
 }
 
 static struct file_system_type smk_fs_type = {
 	.name		= "smackfs",
-	.get_sb		= smk_get_sb,
+	.mount		= smk_mount,
 	.kill_sb	= kill_litter_super,
 };
 
-- 
cgit v1.2.3


From 3c26ff6e499ee7e6f9f2bc7da5f2f30d80862ecf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 11:46:36 +0400
Subject: convert get_sb_nodev() users

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/autofs/init.c    |  8 ++++----
 drivers/staging/pohmelfs/inode.c |  9 ++++-----
 drivers/staging/smbfs/inode.c    |  8 ++++----
 fs/autofs4/init.c                |  8 ++++----
 fs/coda/inode.c                  |  8 ++++----
 fs/exofs/super.c                 | 10 +++++-----
 fs/fuse/inode.c                  |  8 ++++----
 fs/hostfs/hostfs_kern.c          |  8 ++++----
 fs/hppfs/hppfs.c                 |  8 ++++----
 fs/hugetlbfs/inode.c             |  8 ++++----
 fs/ncpfs/inode.c                 |  8 ++++----
 fs/ocfs2/dlmfs/dlmfs.c           |  8 ++++----
 fs/ramfs/inode.c                 | 17 ++++++++---------
 fs/super.c                       | 27 ++++++++++++++++++++-------
 include/linux/fs.h               |  3 +++
 include/linux/ramfs.h            |  4 ++--
 mm/shmem.c                       | 10 +++++-----
 17 files changed, 87 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/autofs/init.c b/drivers/staging/autofs/init.c
index 765c72f42976..5e4b372ea663 100644
--- a/drivers/staging/autofs/init.c
+++ b/drivers/staging/autofs/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs_kill_sb,
 };
 
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index c62d30017c07..61685ccceda8 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1937,11 +1937,10 @@ err_out_exit:
 /*
  * Some VFS magic here...
  */
-static int pohmelfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *pohmelfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, pohmelfs_fill_super,
-				mnt);
+	return mount_nodev(fs_type, flags, data, pohmelfs_fill_super);
 }
 
 /*
@@ -1958,7 +1957,7 @@ static void pohmelfs_kill_super(struct super_block *sb)
 static struct file_system_type pohmel_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "pohmel",
-	.get_sb		= pohmelfs_get_sb,
+	.mount		= pohmelfs_mount,
 	.kill_sb 	= pohmelfs_kill_super,
 };
 
diff --git a/drivers/staging/smbfs/inode.c b/drivers/staging/smbfs/inode.c
index 552951aa7498..f9c493591ce7 100644
--- a/drivers/staging/smbfs/inode.c
+++ b/drivers/staging/smbfs/inode.c
@@ -793,16 +793,16 @@ out:
 	return error;
 }
 
-static int smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *smb_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, smb_fill_super);
 }
 
 static struct file_system_type smb_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "smbfs",
-	.get_sb		= smb_get_sb,
+	.mount		= smb_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 7993b96ca348..5ea57c8c7f97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -306,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static int coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *coda_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, coda_fill_super);
 }
 
 struct file_system_type coda_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "coda",
-	.get_sb		= coda_get_sb,
+	.mount		= coda_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..79c3ae6e0456 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -659,19 +659,19 @@ free_bdi:
 /*
  * Set up the superblock (calls exofs_fill_super eventually)
  */
-static int exofs_get_sb(struct file_system_type *type,
+static struct dentry *exofs_mount(struct file_system_type *type,
 			  int flags, const char *dev_name,
-			  void *data, struct vfsmount *mnt)
+			  void *data)
 {
 	struct exofs_mountopt opts;
 	int ret;
 
 	ret = parse_options(data, &opts);
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 
 	opts.dev_name = dev_name;
-	return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+	return mount_nodev(type, flags, &opts, exofs_fill_super);
 }
 
 /*
@@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
 static struct file_system_type exofs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "exofs",
-	.get_sb         = exofs_get_sb,
+	.mount          = exofs_mount,
 	.kill_sb        = generic_shutdown_super,
 };
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index edf6a1843533..cfce3ad86a92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	return err;
 }
 
-static int fuse_get_sb(struct file_system_type *fs_type,
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
-		       void *raw_data, struct vfsmount *mnt)
+		       void *raw_data)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
 }
 
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,7 +1065,7 @@ static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
 	.fs_flags	= FS_HAS_SUBTYPE,
-	.get_sb		= fuse_get_sb,
+	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index cd7c93917cc7..2c0f148a49e6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -962,11 +962,11 @@ out:
 	return err;
 }
 
-static int hostfs_read_sb(struct file_system_type *type,
+static struct dentry *hostfs_read_sb(struct file_system_type *type,
 			  int flags, const char *dev_name,
-			  void *data, struct vfsmount *mnt)
+			  void *data)
 {
-	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
+	return mount_nodev(type, flags, data, hostfs_fill_sb_common);
 }
 
 static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s)
 static struct file_system_type hostfs_type = {
 	.owner 		= THIS_MODULE,
 	.name 		= "hostfs",
-	.get_sb 	= hostfs_read_sb,
+	.mount	 	= hostfs_read_sb,
 	.kill_sb	= hostfs_kill_sb,
 	.fs_flags 	= 0,
 };
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 4e2a45ea6140..f702b5f713fc 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -748,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static int hppfs_read_super(struct file_system_type *type,
+static struct dentry *hppfs_read_super(struct file_system_type *type,
 			    int flags, const char *dev_name,
-			    void *data, struct vfsmount *mnt)
+			    void *data)
 {
-	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
+	return mount_nodev(type, flags, data, hppfs_fill_super);
 }
 
 static struct file_system_type hppfs_type = {
 	.owner 		= THIS_MODULE,
 	.name 		= "hppfs",
-	.get_sb 	= hppfs_read_super,
+	.mount 		= hppfs_read_super,
 	.kill_sb	= kill_anon_super,
 	.fs_flags 	= 0,
 };
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b14be3f781c7..d6cfac1f0a40 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -896,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
 	}
 }
 
-static int hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
 	.name		= "hugetlbfs",
-	.get_sb		= hugetlbfs_get_sb,
+	.mount		= hugetlbfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 985fabb26aca..d290545aa0c4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -1020,16 +1020,16 @@ out:
 	return result;
 }
 
-static int ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }
 
 static struct file_system_type ncp_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ncpfs",
-	.get_sb		= ncp_get_sb,
+	.mount		= ncp_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 75e115f1bd73..b2df490a19ed 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -643,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
 	.setattr	= dlmfs_file_setattr,
 };
 
-static int dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 }
 
 static struct file_system_type dlmfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ocfs2_dlmfs",
-	.get_sb		= dlmfs_get_sb,
+	.mount		= dlmfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 67fadb1ad2c1..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -255,17 +255,16 @@ fail:
 	return err;
 }
 
-int ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
 
-static int rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
-			    mnt);
+	return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -276,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
-	.get_sb		= ramfs_get_sb,
+	.mount		= ramfs_mount,
 	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
-	.get_sb		= rootfs_get_sb,
+	.mount		= rootfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/super.c b/fs/super.c
index 6f021a171ac6..f6a7bf1fff2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -870,29 +870,42 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
 #endif
 
-int get_sb_nodev(struct file_system_type *fs_type,
+struct dentry *mount_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	s->s_flags = flags;
 
 	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		deactivate_locked_super(s);
-		return error;
+		return ERR_PTR(error);
 	}
 	s->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 }
+EXPORT_SYMBOL(mount_nodev);
+
+int get_sb_nodev(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
 
+	root = mount_nodev(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
+}
 EXPORT_SYMBOL(get_sb_nodev);
 
 static int compare_single(struct super_block *s, void *p)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0aa2f1202afa..4c3a29ddcacb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1806,6 +1806,9 @@ extern int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt);
+extern struct dentry *mount_nodev(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int));
 extern int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int),
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index e7320b5e82fb..3a8f0c9b2933 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -3,8 +3,8 @@
 
 struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir,
 	 int mode, dev_t dev);
-extern int ramfs_get_sb(struct file_system_type *fs_type,
-	 int flags, const char *dev_name, void *data, struct vfsmount *mnt);
+extern struct dentry *ramfs_mount(struct file_system_type *fs_type,
+	 int flags, const char *dev_name, void *data);
 
 #ifndef CONFIG_MMU
 extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
diff --git a/mm/shmem.c b/mm/shmem.c
index f6d350e8adc5..47fdeeb9d636 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2538,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
 };
 
 
-static int shmem_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, shmem_fill_super);
 }
 
 static struct file_system_type tmpfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "tmpfs",
-	.get_sb		= shmem_get_sb,
+	.mount		= shmem_mount,
 	.kill_sb	= kill_litter_super,
 };
 
@@ -2643,7 +2643,7 @@ out:
 
 static struct file_system_type tmpfs_fs_type = {
 	.name		= "tmpfs",
-	.get_sb		= ramfs_get_sb,
+	.mount		= ramfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
-- 
cgit v1.2.3


From 51139adac92f7160ad3ca1cab2de1b4b8d19dc96 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 23:47:46 +0400
Subject: convert get_sb_pseudo() users

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c |  9 ++++-----
 drivers/mtd/mtdchar.c      | 10 ++++------
 fs/anon_inodes.c           | 10 ++++------
 fs/block_dev.c             |  8 ++++----
 fs/libfs.c                 | 14 ++++++--------
 fs/pipe.c                  |  9 ++++-----
 include/linux/fs.h         |  5 ++---
 net/socket.c               | 10 ++++------
 8 files changed, 32 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 6b1852f7f972..39e534f5a3b0 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -618,16 +618,15 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 }
 
 
-static int
-pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,
-	     struct vfsmount *mnt)
+static struct dentry *
+pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
+	return mount_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
 }
 
 static struct file_system_type pfm_fs_type = {
 	.name     = "pfmfs",
-	.get_sb   = pfmfs_get_sb,
+	.mount    = pfmfs_mount,
 	.kill_sb  = kill_anon_super,
 };
 
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 5ef45487b65f..a34a0fe14884 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -1030,17 +1030,15 @@ static const struct file_operations mtd_fops = {
 #endif
 };
 
-static int mtd_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-                               const char *dev_name, void *data,
-                               struct vfsmount *mnt)
+static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
 {
-        return get_sb_pseudo(fs_type, "mtd_inode:", NULL, MTD_INODE_FS_MAGIC,
-                             mnt);
+        return mount_pseudo(fs_type, "mtd_inode:", NULL, MTD_INODE_FS_MAGIC);
 }
 
 static struct file_system_type mtd_inodefs_type = {
        .name = "mtd_inodefs",
-       .get_sb = mtd_inodefs_get_sb,
+       .mount = mtd_inodefs_mount,
        .kill_sb = kill_anon_super,
 };
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 5365527ca43f..57ce55b2564c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
 
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-			       const char *dev_name, void *data,
-			       struct vfsmount *mnt)
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
-			     mnt);
+	return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
 }
 
 /*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.get_sb		= anon_inodefs_get_sb,
+	.mount		= anon_inodefs_mount,
 	.kill_sb	= kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index dea3b628a6ce..06e8ff12b97c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -464,15 +464,15 @@ static const struct super_operations bdev_sops = {
 	.evict_inode = bdev_evict_inode,
 };
 
-static int bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
+	return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
 }
 
 static struct file_system_type bd_type = {
 	.name		= "bdev",
-	.get_sb		= bd_get_sb,
+	.mount		= bd_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 304a5132ca27..a3accdf528ad 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-int get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	const struct super_operations *ops, unsigned long magic,
-	struct vfsmount *mnt)
+struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
+	const struct super_operations *ops, unsigned long magic)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 Enomem:
 	deactivate_locked_super(s);
-	return -ENOMEM;
+	return ERR_PTR(-ENOMEM);
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -951,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
-EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(mount_pseudo);
 EXPORT_SYMBOL(simple_write_begin);
 EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/pipe.c b/fs/pipe.c
index d2d7566ce68e..a8012a955720 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1247,16 +1247,15 @@ out:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-static int pipefs_get_sb(struct file_system_type *fs_type,
-			 int flags, const char *dev_name, void *data,
-			 struct vfsmount *mnt)
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
+	return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
 }
 
 static struct file_system_type pipe_fs_type = {
 	.name		= "pipefs",
-	.get_sb		= pipefs_get_sb,
+	.mount		= pipefs_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4c3a29ddcacb..43e6cfb5cbb3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1824,9 +1824,8 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			void *data);
-extern int get_sb_pseudo(struct file_system_type *, char *,
-	const struct super_operations *ops, unsigned long,
-	struct vfsmount *mnt);
+extern struct dentry *mount_pseudo(struct file_system_type *, char *,
+	const struct super_operations *ops, unsigned long);
 extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 
 static inline void sb_mark_dirty(struct super_block *sb)
diff --git a/net/socket.c b/net/socket.c
index ee3cd280c76e..5247ae10f374 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -305,19 +305,17 @@ static const struct super_operations sockfs_ops = {
 	.statfs		= simple_statfs,
 };
 
-static int sockfs_get_sb(struct file_system_type *fs_type,
-			 int flags, const char *dev_name, void *data,
-			 struct vfsmount *mnt)
+static struct dentry *sockfs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
-			     mnt);
+	return mount_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
 }
 
 static struct vfsmount *sock_mnt __read_mostly;
 
 static struct file_system_type sock_fs_type = {
 	.name =		"sockfs",
-	.get_sb =	sockfs_get_sb,
+	.mount =	sockfs_mount,
 	.kill_sb =	kill_anon_super,
 };
 
-- 
cgit v1.2.3


From 157d81e7ffe04f2c97c3580e185787c2d29463bb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 23:52:42 +0400
Subject: convert ubifs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ubifs/super.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9a47c9f0ad07..91fac54c70e3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2038,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
 	return c->vi.cdev == *dev;
 }
 
-static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *name, void *data, struct vfsmount *mnt)
+static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
+			const char *name, void *data)
 {
 	struct ubi_volume_desc *ubi;
 	struct ubi_volume_info vi;
@@ -2057,7 +2057,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(ubi)) {
 		dbg_err("cannot open \"%s\", error %d",
 			name, (int)PTR_ERR(ubi));
-		return PTR_ERR(ubi);
+		return ERR_CAST(ubi);
 	}
 	ubi_get_volume_info(ubi, &vi);
 
@@ -2095,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 
 out_deact:
 	deactivate_locked_super(sb);
 out_close:
 	ubi_close_volume(ubi);
-	return err;
+	return ERR_PTR(err);
 }
 
 static struct file_system_type ubifs_fs_type = {
 	.name    = "ubifs",
 	.owner   = THIS_MODULE,
-	.get_sb  = ubifs_get_sb,
+	.mount   = ubifs_mount,
 	.kill_sb = kill_anon_super,
 };
 
-- 
cgit v1.2.3


From d2d1ea93069bd7706206b9c124e438ab2795612c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 25 Jul 2010 23:56:43 +0400
Subject: convert v9fs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_super.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48d4215c60a8..c55c614500ad 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -68,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
  * v9fs_fill_super - populate superblock with info
  * @sb: superblock
  * @v9ses: session information
- * @flags: flags propagated from v9fs_get_sb()
+ * @flags: flags propagated from v9fs_mount()
  *
  */
 
@@ -99,18 +99,16 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 }
 
 /**
- * v9fs_get_sb - mount a superblock
+ * v9fs_mount - mount a superblock
  * @fs_type: file system type
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
- * @mnt: mountpoint record to be instantiated
  *
  */
 
-static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data,
-		       struct vfsmount *mnt)
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
 	struct super_block *sb = NULL;
 	struct inode *inode = NULL;
@@ -124,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	fid = v9fs_session_init(v9ses, dev_name, data);
 	if (IS_ERR(fid)) {
@@ -186,15 +184,15 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	v9fs_fid_add(root, fid);
 
 	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 
 clunk_fid:
 	p9_client_clunk(fid);
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
-	return retval;
+	return ERR_PTR(retval);
+
 release_sb:
 	/*
 	 * we will do the session_close and root dentry release
@@ -204,7 +202,7 @@ release_sb:
 	 */
 	p9_client_clunk(fid);
 	deactivate_locked_super(sb);
-	return retval;
+	return ERR_PTR(retval);
 }
 
 /**
@@ -300,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
-	.get_sb = v9fs_get_sb,
+	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
 	.fs_flags = FS_RENAME_DOES_D_MOVE,
-- 
cgit v1.2.3


From 71a1c0125f132b2a4656689ca585c5d8931e539c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 11:25:05 +0400
Subject: logfs get_sb massage, part 1

move allocation of logfs_super to logfs_get_sb, pass it to
logfs_get_sb_...().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/dev_bdev.c | 12 ++++++++----
 fs/logfs/dev_mtd.c  |  9 ++++++---
 fs/logfs/logfs.h    | 17 ++++++++++++-----
 fs/logfs/super.c    | 22 +++++++++++++---------
 4 files changed, 39 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..bca8e2a8e55b 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/gfp.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
@@ -320,20 +321,23 @@ static const struct logfs_device_ops bd_devops = {
 	.put_device	= bdev_put_device,
 };
 
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *p,
+		struct file_system_type *type, int flags,
 		const char *devname, struct vfsmount *mnt)
 {
 	struct block_device *bdev;
 
 	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
-	if (IS_ERR(bdev))
+	if (IS_ERR(bdev)) {
+		kfree(p);
 		return PTR_ERR(bdev);
+	}
 
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
 		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-		return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+		return logfs_get_sb_mtd(p, type, flags, mtdnr, mnt);
 	}
 
-	return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+	return logfs_get_sb_device(p, type, flags, NULL, bdev, &bd_devops, mnt);
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..e886cdcf9d7e 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -265,14 +265,17 @@ static const struct logfs_device_ops mtd_devops = {
 	.put_device	= mtd_put_device,
 };
 
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		int mtdnr, struct vfsmount *mnt)
 {
 	struct mtd_info *mtd;
 	const struct logfs_device_ops *devops = &mtd_devops;
 
 	mtd = get_mtd_device(NULL, mtdnr);
-	if (IS_ERR(mtd))
+	if (IS_ERR(mtd)) {
+		kfree(s);
 		return PTR_ERR(mtd);
-	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+	}
+	return logfs_get_sb_device(s, type, flags, mtd, NULL, devops, mnt);
 }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..a3f0aba9e526 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -471,24 +471,30 @@ void logfs_compr_exit(void);
 
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		const char *devname, struct vfsmount *mnt);
 #else
-static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		const char *devname, struct vfsmount *mnt)
 {
+	kfree(s);
 	return -ENODEV;
 }
 #endif
 
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		int mtdnr, struct vfsmount *mnt);
 #else
-static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_mtd(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		int mtdnr, struct vfsmount *mnt)
 {
+	kfree(s);
 	return -ENODEV;
 }
 #endif
@@ -619,7 +625,8 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct file_system_type *type, int flags,
+int logfs_get_sb_device(struct logfs_super *s,
+		struct file_system_type *type, int flags,
 		struct mtd_info *mtd, struct block_device *bdev,
 		const struct logfs_device_ops *devops, struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..5e43178add98 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -536,19 +536,16 @@ static void logfs_kill_sb(struct super_block *sb)
 	log_super("LogFS: Finished unmounting\n");
 }
 
-int logfs_get_sb_device(struct file_system_type *type, int flags,
+int logfs_get_sb_device(struct logfs_super *super,
+		struct file_system_type *type, int flags,
 		struct mtd_info *mtd, struct block_device *bdev,
 		const struct logfs_device_ops *devops, struct vfsmount *mnt)
 {
-	struct logfs_super *super;
 	struct super_block *sb;
 	int err = -ENOMEM;
 	static int mount_count;
 
 	log_super("LogFS: Start mount %x\n", mount_count++);
-	super = kzalloc(sizeof(*super), GFP_KERNEL);
-	if (!super)
-		goto err0;
 
 	super->s_mtd	= mtd;
 	super->s_bdev	= bdev;
@@ -603,20 +600,27 @@ static int logfs_get_sb(struct file_system_type *type, int flags,
 		const char *devname, void *data, struct vfsmount *mnt)
 {
 	ulong mtdnr;
+	struct logfs_super *super;
+
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		return -ENOMEM;
 
 	if (!devname)
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
+		return logfs_get_sb_bdev(super, type, flags, devname, mnt);
 	if (strncmp(devname, "mtd", 3))
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
+		return logfs_get_sb_bdev(super, type, flags, devname, mnt);
 
 	{
 		char *garbage;
 		mtdnr = simple_strtoul(devname+3, &garbage, 0);
-		if (*garbage)
+		if (*garbage) {
+			kfree(super);
 			return -EINVAL;
+		}
 	}
 
-	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+	return logfs_get_sb_mtd(super, type, flags, mtdnr, mnt);
 }
 
 static struct file_system_type logfs_fs_type = {
-- 
cgit v1.2.3


From 0d85c799623cb6022adb1317ed2987ab9c097c2e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 11:33:39 +0400
Subject: logfs get_sb, part 2

take setting s_bdev/s_mtd/s_devops to callers of logfs_get_sb_device(),
don't bother passing them separately

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/dev_bdev.c |  6 +++++-
 fs/logfs/dev_mtd.c  | 11 ++++++-----
 fs/logfs/logfs.h    |  3 +--
 fs/logfs/super.c    |  7 +------
 4 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index bca8e2a8e55b..a322fec10173 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -339,5 +339,9 @@ int logfs_get_sb_bdev(struct logfs_super *p,
 		return logfs_get_sb_mtd(p, type, flags, mtdnr, mnt);
 	}
 
-	return logfs_get_sb_device(p, type, flags, NULL, bdev, &bd_devops, mnt);
+	p->s_bdev = bdev;
+	p->s_mtd = NULL;
+	p->s_devops = &bd_devops;
+
+	return logfs_get_sb_device(p, type, flags, mnt);
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index e886cdcf9d7e..7b22d57d1469 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -269,13 +269,14 @@ int logfs_get_sb_mtd(struct logfs_super *s,
 		struct file_system_type *type, int flags,
 		int mtdnr, struct vfsmount *mnt)
 {
-	struct mtd_info *mtd;
-	const struct logfs_device_ops *devops = &mtd_devops;
-
-	mtd = get_mtd_device(NULL, mtdnr);
+	struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
 	if (IS_ERR(mtd)) {
 		kfree(s);
 		return PTR_ERR(mtd);
 	}
-	return logfs_get_sb_device(s, type, flags, mtd, NULL, devops, mnt);
+
+	s->s_bdev = NULL;
+	s->s_mtd = mtd;
+	s->s_devops = &mtd_devops;
+	return logfs_get_sb_device(s, type, flags, mnt);
 }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index a3f0aba9e526..bdd56fa4b084 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -627,8 +627,7 @@ void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
 int logfs_get_sb_device(struct logfs_super *s,
 		struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt);
+		struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5e43178add98..c80837e54bb3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -538,8 +538,7 @@ static void logfs_kill_sb(struct super_block *sb)
 
 int logfs_get_sb_device(struct logfs_super *super,
 		struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt)
+		struct vfsmount *mnt)
 {
 	struct super_block *sb;
 	int err = -ENOMEM;
@@ -547,8 +546,6 @@ int logfs_get_sb_device(struct logfs_super *super,
 
 	log_super("LogFS: Start mount %x\n", mount_count++);
 
-	super->s_mtd	= mtd;
-	super->s_bdev	= bdev;
 	err = -EINVAL;
 	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
 	if (IS_ERR(sb))
@@ -561,8 +558,6 @@ int logfs_get_sb_device(struct logfs_super *super,
 		goto err0;
 	}
 
-	super->s_devops = devops;
-
 	/*
 	 * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
 	 * only covers 16TB and the upper 8TB are used for indirect blocks.
-- 
cgit v1.2.3


From 7d945a3aa7608f68dba04083d3421e0b43052660 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 11:53:30 +0400
Subject: logfs get_sb, part 3

take logfs_get_sb_device() calls to logfs_get_sb() itself

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/dev_bdev.c | 15 +++++----------
 fs/logfs/dev_mtd.c  | 10 +++-------
 fs/logfs/logfs.h    | 18 ++++++------------
 fs/logfs/super.c    | 25 +++++++++++++++----------
 4 files changed, 29 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a322fec10173..223b673dcc7b 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,7 +9,6 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
-#include <linux/slab.h>
 #include <linux/gfp.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
@@ -321,27 +320,23 @@ static const struct logfs_device_ops bd_devops = {
 	.put_device	= bdev_put_device,
 };
 
-int logfs_get_sb_bdev(struct logfs_super *p,
-		struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
+		const char *devname)
 {
 	struct block_device *bdev;
 
 	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
-	if (IS_ERR(bdev)) {
-		kfree(p);
+	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
-	}
 
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
 		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-		return logfs_get_sb_mtd(p, type, flags, mtdnr, mnt);
+		return logfs_get_sb_mtd(p, mtdnr);
 	}
 
 	p->s_bdev = bdev;
 	p->s_mtd = NULL;
 	p->s_devops = &bd_devops;
-
-	return logfs_get_sb_device(p, type, flags, mnt);
+	return 0;
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 7b22d57d1469..9e3dbe3712ae 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -265,18 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
 	.put_device	= mtd_put_device,
 };
 
-int logfs_get_sb_mtd(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
 	struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
-	if (IS_ERR(mtd)) {
-		kfree(s);
+	if (IS_ERR(mtd))
 		return PTR_ERR(mtd);
-	}
 
 	s->s_bdev = NULL;
 	s->s_mtd = mtd;
 	s->s_devops = &mtd_devops;
-	return logfs_get_sb_device(s, type, flags, mnt);
+	return 0;
 }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index bdd56fa4b084..5d2e66b48290 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -472,29 +472,23 @@ void logfs_compr_exit(void);
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
 int logfs_get_sb_bdev(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt);
+		struct file_system_type *type,
+		const char *devname);
 #else
 static inline int logfs_get_sb_bdev(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+		struct file_system_type *type,
+		const char *devname)
 {
-	kfree(s);
 	return -ENODEV;
 }
 #endif
 
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt);
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 #else
-static inline int logfs_get_sb_mtd(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
-	kfree(s);
 	return -ENODEV;
 }
 #endif
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c80837e54bb3..f57a150b4779 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -596,26 +596,31 @@ static int logfs_get_sb(struct file_system_type *type, int flags,
 {
 	ulong mtdnr;
 	struct logfs_super *super;
+	int err;
 
 	super = kzalloc(sizeof(*super), GFP_KERNEL);
 	if (!super)
 		return -ENOMEM;
 
 	if (!devname)
-		return logfs_get_sb_bdev(super, type, flags, devname, mnt);
-	if (strncmp(devname, "mtd", 3))
-		return logfs_get_sb_bdev(super, type, flags, devname, mnt);
-
-	{
+		err = logfs_get_sb_bdev(super, type, devname);
+	else if (strncmp(devname, "mtd", 3))
+		err = logfs_get_sb_bdev(super, type, devname);
+	else {
 		char *garbage;
 		mtdnr = simple_strtoul(devname+3, &garbage, 0);
-		if (*garbage) {
-			kfree(super);
-			return -EINVAL;
-		}
+		if (*garbage)
+			err = -EINVAL;
+		else
+			err = logfs_get_sb_mtd(super, mtdnr);
+	}
+
+	if (err) {
+		kfree(super);
+		return err;
 	}
 
-	return logfs_get_sb_mtd(super, type, flags, mtdnr, mnt);
+	return logfs_get_sb_device(super, type, flags, mnt);
 }
 
 static struct file_system_type logfs_fs_type = {
-- 
cgit v1.2.3


From e5a0726a953daf224ae42bcf5edaa64f71b4e8a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 12:06:00 +0400
Subject: logfs: fix a leak in get_sb

a) switch ->put_device() to logfs_super *
b) actually call it on early failures in logfs_get_sb_device()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/dev_bdev.c | 4 ++--
 fs/logfs/dev_mtd.c  | 4 ++--
 fs/logfs/logfs.h    | 3 ++-
 fs/logfs/super.c    | 4 ++--
 4 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 223b673dcc7b..92ca6fbe09bd 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 	return sync_request(page, bdev, WRITE);
 }
 
-static void bdev_put_device(struct super_block *sb)
+static void bdev_put_device(struct logfs_super *s)
 {
-	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+	close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 9e3dbe3712ae..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
 	__mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
 }
 
-static void mtd_put_device(struct super_block *sb)
+static void mtd_put_device(struct logfs_super *s)
 {
-	put_mtd_device(logfs_super(sb)->s_mtd);
+	put_mtd_device(s->s_mtd);
 }
 
 static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5d2e66b48290..446c0f12d899 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
 	int	(*erase_segment)(struct logfs_area *area);
 };
 
+struct logfs_super;	/* forward */
 /**
  * struct logfs_device_ops - device access operations
  *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
 			int ensure_write);
 	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
-	void (*put_device)(struct super_block *sb);
+	void (*put_device)(struct logfs_super *s);
 };
 
 /**
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f57a150b4779..f07d40e41c3f 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -529,7 +529,7 @@ static void logfs_kill_sb(struct super_block *sb)
 	logfs_cleanup_rw(sb);
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
-	super->s_devops->put_device(sb);
+	super->s_devops->put_device(super);
 	logfs_mempool_destroy(super->s_btree_pool);
 	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
@@ -586,8 +586,8 @@ err1:
 	deactivate_locked_super(sb);
 	return err;
 err0:
+	super->s_devops->put_device(super);
 	kfree(super);
-	//devops->put_device(sb);
 	return err;
 }
 
-- 
cgit v1.2.3


From a1da9e8ab687e6496482b7b2aa17d0da31e55b20 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 12:14:03 +0400
Subject: switch logfs to ->mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/logfs.h |  3 ---
 fs/logfs/super.c | 47 +++++++++++++++++++++++------------------------
 2 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 446c0f12d899..cd51a36b37f0 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -620,9 +620,6 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct logfs_super *s,
-		struct file_system_type *type, int flags,
-		struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f07d40e41c3f..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
 	return 0;
 }
 
-static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+static int logfs_get_sb_final(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 	}
 
 	log_super("LogFS: Finished mounting\n");
-	simple_set_mnt(mnt, sb);
 	return 0;
 
 fail:
@@ -536,9 +535,8 @@ static void logfs_kill_sb(struct super_block *sb)
 	log_super("LogFS: Finished unmounting\n");
 }
 
-int logfs_get_sb_device(struct logfs_super *super,
-		struct file_system_type *type, int flags,
-		struct vfsmount *mnt)
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
+		struct file_system_type *type, int flags)
 {
 	struct super_block *sb;
 	int err = -ENOMEM;
@@ -548,14 +546,17 @@ int logfs_get_sb_device(struct logfs_super *super,
 
 	err = -EINVAL;
 	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
-	if (IS_ERR(sb))
-		goto err0;
+	if (IS_ERR(sb)) {
+		super->s_devops->put_device(super);
+		kfree(super);
+		return ERR_CAST(sb);
+	}
 
 	if (sb->s_root) {
 		/* Device is already in use */
-		err = 0;
-		simple_set_mnt(mnt, sb);
-		goto err0;
+		super->s_devops->put_device(super);
+		kfree(super);
+		return dget(sb->s_root);
 	}
 
 	/*
@@ -573,10 +574,12 @@ int logfs_get_sb_device(struct logfs_super *super,
 		goto err1;
 
 	sb->s_flags |= MS_ACTIVE;
-	err = logfs_get_sb_final(sb, mnt);
-	if (err)
+	err = logfs_get_sb_final(sb);
+	if (err) {
 		deactivate_locked_super(sb);
-	return err;
+		return ERR_PTR(err);
+	}
+	return dget(sb->s_root);
 
 err1:
 	/* no ->s_root, no ->put_super() */
@@ -584,15 +587,11 @@ err1:
 	iput(super->s_segfile_inode);
 	iput(super->s_mapping_inode);
 	deactivate_locked_super(sb);
-	return err;
-err0:
-	super->s_devops->put_device(super);
-	kfree(super);
-	return err;
+	return ERR_PTR(err);
 }
 
-static int logfs_get_sb(struct file_system_type *type, int flags,
-		const char *devname, void *data, struct vfsmount *mnt)
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
+		const char *devname, void *data)
 {
 	ulong mtdnr;
 	struct logfs_super *super;
@@ -600,7 +599,7 @@ static int logfs_get_sb(struct file_system_type *type, int flags,
 
 	super = kzalloc(sizeof(*super), GFP_KERNEL);
 	if (!super)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	if (!devname)
 		err = logfs_get_sb_bdev(super, type, devname);
@@ -617,16 +616,16 @@ static int logfs_get_sb(struct file_system_type *type, int flags,
 
 	if (err) {
 		kfree(super);
-		return err;
+		return ERR_PTR(err);
 	}
 
-	return logfs_get_sb_device(super, type, flags, mnt);
+	return logfs_get_sb_device(super, type, flags);
 }
 
 static struct file_system_type logfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "logfs",
-	.get_sb		= logfs_get_sb,
+	.mount		= logfs_mount,
 	.kill_sb	= logfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 
-- 
cgit v1.2.3


From e4c59d61e80529aebca4a3690b4378f2c6fc4e82 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 12:19:34 +0400
Subject: convert nilfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 35ae03c0db86..f804d41ec9d3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1141,9 +1141,9 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
-static int
-nilfs_get_sb(struct file_system_type *fs_type, int flags,
-	     const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+nilfs_mount(struct file_system_type *fs_type, int flags,
+	     const char *dev_name, void *data)
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
@@ -1156,7 +1156,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
-		return PTR_ERR(sd.bdev);
+		return ERR_CAST(sd.bdev);
 
 	sd.cno = 0;
 	sd.flags = flags;
@@ -1235,9 +1235,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s_new)
 		close_bdev_exclusive(sd.bdev, mode);
 
-	mnt->mnt_sb = s;
-	mnt->mnt_root = root_dentry;
-	return 0;
+	return root_dentry;
 
  failed_super:
 	deactivate_locked_super(s);
@@ -1245,13 +1243,13 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
  failed:
 	if (!s_new)
 		close_bdev_exclusive(sd.bdev, mode);
-	return err;
+	return ERR_PTR(err);
 }
 
 struct file_system_type nilfs_fs_type = {
 	.owner    = THIS_MODULE,
 	.name     = "nilfs2",
-	.get_sb   = nilfs_get_sb,
+	.mount    = nilfs_mount,
 	.kill_sb  = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
-- 
cgit v1.2.3


From d753ed975953a4e97a356bcd59ff146919f29235 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 12:52:33 +0400
Subject: convert cifs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/cifsfs.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 34371637f210..8bd5c2c243a5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -545,9 +545,9 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
 
-static int
-cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+cifs_do_mount(struct file_system_type *fs_type,
+	    int flags, const char *dev_name, void *data)
 {
 	int rc;
 	struct super_block *sb;
@@ -557,18 +557,17 @@ cifs_get_sb(struct file_system_type *fs_type,
 	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	sb->s_flags = flags;
 
 	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		deactivate_locked_super(sb);
-		return rc;
+		return ERR_PTR(rc);
 	}
 	sb->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -634,7 +633,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
-	.get_sb = cifs_get_sb,
+	.mount = cifs_do_mount,
 	.kill_sb = kill_anon_super,
 	/*  .fs_flags */
 };
-- 
cgit v1.2.3


From 579441a39bbbbc408acd5b228d63e76cff708fe6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 13:09:36 +0400
Subject: setting ->proc_mnt doesn't belong in proc_get_sb()

take that to kern_mount_data()-using callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/root.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 93d99b316325..03b4f6fe4984 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -79,7 +79,6 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		}
 
 		sb->s_flags |= MS_ACTIVE;
-		ns->proc_mnt = mnt;
 	}
 
 	simple_set_mnt(mnt, sb);
@@ -115,6 +114,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
+	init_pid_ns.proc_mnt = proc_mnt;
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
@@ -213,6 +213,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
+	ns->proc_mnt = mnt;
 	return 0;
 }
 
-- 
cgit v1.2.3


From aed1d84f98738bcc1c605e1ff442de9890441315 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 13:12:54 +0400
Subject: switch procfs to ->mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/root.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03b4f6fe4984..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-static int proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	int err;
 	struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, ns);
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	if (!sb->s_root) {
 		sb->s_flags = flags;
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
-			return err;
+			return ERR_PTR(err);
 		}
 
 		ei = PROC_I(sb->s_root->d_inode);
@@ -81,8 +81,7 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
 static void proc_kill_sb(struct super_block *sb)
@@ -96,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 
 static struct file_system_type proc_fs_type = {
 	.name		= "proc",
-	.get_sb		= proc_get_sb,
+	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
 };
 
-- 
cgit v1.2.3


From ceefda6931806972ecf550bd8231dce4a4178953 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 13:16:50 +0400
Subject: switch get_sb_ns() users

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 14 ++++++--------
 include/linux/fs.h |  5 ++---
 ipc/mqueue.c       |  8 ++++----
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index f6a7bf1fff2b..ca696155cd9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
+	void *data, int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 
 	sb = sget(fs_type, ns_test_super, ns_set_super, data);
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	if (!sb->s_root) {
 		int err;
@@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
 		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 		if (err) {
 			deactivate_locked_super(sb);
-			return err;
+			return ERR_PTR(err);
 		}
 
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
-EXPORT_SYMBOL(get_sb_ns);
+EXPORT_SYMBOL(mount_ns);
 
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 43e6cfb5cbb3..4d07902bc50c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1789,9 +1789,8 @@ struct file_system_type {
 	struct lock_class_key i_alloc_sem_key;
 };
 
-extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt);
+extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
+	void *data, int (*fill_super)(struct super_block *, void *, int));
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3a61ffefe884..035f4399edbc 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -211,13 +211,13 @@ out:
 	return error;
 }
 
-static int mqueue_get_sb(struct file_system_type *fs_type,
+static struct dentry *mqueue_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name,
-			 void *data, struct vfsmount *mnt)
+			 void *data)
 {
 	if (!(flags & MS_KERNMOUNT))
 		data = current->nsproxy->ipc_ns;
-	return get_sb_ns(fs_type, flags, data, mqueue_fill_super, mnt);
+	return mount_ns(fs_type, flags, data, mqueue_fill_super);
 }
 
 static void init_once(void *foo)
@@ -1232,7 +1232,7 @@ static const struct super_operations mqueue_super_ops = {
 
 static struct file_system_type mqueue_fs_type = {
 	.name = "mqueue",
-	.get_sb = mqueue_get_sb,
+	.mount = mqueue_mount,
 	.kill_sb = kill_litter_super,
 };
 
-- 
cgit v1.2.3


From d0e46f88b2f73828faf00d559c7e5b3ce9e39a4b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 13:30:36 +0400
Subject: convert sysfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysfs/mount.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
 #include "sysfs.h"
 
 
-static struct vfsmount *sysfs_mount;
+static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 
 static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
 	return error;
 }
 
-static int sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *sysfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	struct sysfs_super_info *info;
 	enum kobj_ns_type type;
 	struct super_block *sb;
 	int error;
 
-	error = -ENOMEM;
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
-	if (IS_ERR(sb)) {
-		error = PTR_ERR(sb);
-		goto out;
-	}
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
 	if (!sb->s_root) {
 		sb->s_flags = flags;
 		error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(sb);
-			goto out;
+			return ERR_PTR(error);
 		}
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	error = 0;
-out:
-	return error;
+	return dget(sb->s_root);
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
-	.get_sb		= sysfs_get_sb,
+	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
 };
 
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
 
 	err = register_filesystem(&sysfs_fs_type);
 	if (!err) {
-		sysfs_mount = kern_mount(&sysfs_fs_type);
-		if (IS_ERR(sysfs_mount)) {
+		sysfs_mnt = kern_mount(&sysfs_fs_type);
+		if (IS_ERR(sysfs_mnt)) {
 			printk(KERN_ERR "sysfs: could not mount!\n");
-			err = PTR_ERR(sysfs_mount);
-			sysfs_mount = NULL;
+			err = PTR_ERR(sysfs_mnt);
+			sysfs_mnt = NULL;
 			unregister_filesystem(&sysfs_fs_type);
 			goto out_err;
 		}
-- 
cgit v1.2.3


From 4d143beb0429e8c9c5f1dc66c7ff8ee70dde45a4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 13:33:36 +0400
Subject: convert ecryptfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/main.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..8585934712d4 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -540,9 +540,8 @@ out:
  *                        ecryptfs_interpose to perform most of the linking
  * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
  */
-static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *raw_data,
-			struct vfsmount *mnt)
+static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data)
 {
 	struct super_block *s;
 	struct ecryptfs_sb_info *sbi;
@@ -607,8 +606,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 		err = "Reading sb failed";
 		goto out;
 	}
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 out:
 	if (sbi) {
@@ -616,7 +614,7 @@ out:
 		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
 	}
 	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
-	return rc;
+	return ERR_PTR(rc);
 }
 
 /**
@@ -639,7 +637,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
 static struct file_system_type ecryptfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "ecryptfs",
-	.get_sb = ecryptfs_get_sb,
+	.mount = ecryptfs_mount,
 	.kill_sb = ecryptfs_kill_block_super,
 	.fs_flags = 0
 };
-- 
cgit v1.2.3


From f7442b3be65bfcabbb5d6e896e65d69e8b261583 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 14:16:21 +0400
Subject: convert afs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/super.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/super.c b/fs/afs/super.c
index eacf76d98ae0..27201cffece4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -29,9 +29,8 @@
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name,
-		      void *data, struct vfsmount *mnt);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
-	.get_sb		= afs_get_sb,
+	.mount		= afs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= 0,
 };
@@ -359,11 +358,8 @@ error:
 /*
  * get an AFS superblock
  */
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags,
-		      const char *dev_name,
-		      void *options,
-		      struct vfsmount *mnt)
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -427,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
-	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
 	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
-	return 0;
+	return dget(sb->s_root);
 
 error:
 	afs_put_volume(params.volume);
@@ -440,7 +435,7 @@ error:
 	key_put(params.key);
 	kfree(new_opts);
 	_leave(" = %d", ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
-- 
cgit v1.2.3


From 8bcbbf0009dd467afd6bed1fedfcb1d2463f55a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 16:03:58 +0400
Subject: convert gfs2

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/ops_fstype.c | 51 ++++++++++++++++++++++-----------------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index cade1acbcea9..3eb1393f7b81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1250,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 }
 
 /**
- * gfs2_get_sb - Get the GFS2 superblock
+ * gfs2_mount - Get the GFS2 superblock
  * @fs_type: The GFS2 filesystem type
  * @flags: Mount flags
  * @dev_name: The name of the device
  * @data: The mount arguments
- * @mnt: The vfsmnt for this mount
  *
  * Q. Why not use get_sb_bdev() ?
  * A. We need to select one of two root directories to mount, independent
@@ -1264,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
  * Returns: 0 or -ve on error
  */
 
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -1279,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 
 	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+		return ERR_CAST(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -1298,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(s))
 		goto error_bdev;
 
+	if (s->s_root)
+		close_bdev_exclusive(bdev, mode);
+
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
@@ -1309,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	error = gfs2_mount_args(&args, data);
 	if (error) {
 		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-		if (s->s_root)
-			goto error_super;
-		deactivate_locked_super(s);
-		return error;
+		goto error_super;
 	}
 
 	if (s->s_root) {
 		error = -EBUSY;
 		if ((flags ^ s->s_flags) & MS_RDONLY)
 			goto error_super;
-		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -1328,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return error;
-		}
+		if (error)
+			goto error_super;
 		s->s_flags |= MS_ACTIVE;
 		bdev->bd_super = s;
 	}
 
 	sdp = s->s_fs_info;
-	mnt->mnt_sb = s;
 	if (args.ar_meta)
-		mnt->mnt_root = dget(sdp->sd_master_dir);
+		return dget(sdp->sd_master_dir);
 	else
-		mnt->mnt_root = dget(sdp->sd_root_dir);
-	return 0;
+		return dget(sdp->sd_root_dir);
 
 error_super:
 	deactivate_locked_super(s);
+	return ERR_PTR(error);
 error_bdev:
 	close_bdev_exclusive(bdev, mode);
-	return error;
+	return ERR_PTR(error);
 }
 
 static int set_meta_super(struct super_block *s, void *ptr)
@@ -1356,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
 	return -EINVAL;
 }
 
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
 {
 	struct super_block *s;
 	struct gfs2_sbd *sdp;
@@ -1368,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return error;
+		return ERR_PTR(error);
 	}
 	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
 		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 	}
 	if ((flags ^ s->s_flags) & MS_RDONLY) {
 		deactivate_locked_super(s);
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 	}
 	sdp = s->s_fs_info;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = dget(sdp->sd_master_dir);
-	return 0;
+	return dget(sdp->sd_master_dir);
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1410,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 struct file_system_type gfs2_fs_type = {
 	.name = "gfs2",
 	.fs_flags = FS_REQUIRES_DEV,
-	.get_sb = gfs2_get_sb,
+	.mount = gfs2_mount,
 	.kill_sb = gfs2_kill_sb,
 	.owner = THIS_MODULE,
 };
@@ -1418,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
 struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
-	.get_sb = gfs2_get_sb_meta,
+	.mount = gfs2_mount_meta,
 	.owner = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From a7f9fb205a88ab9af675a68fc554cf51dafc8b60 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 16:17:55 +0400
Subject: convert ceph

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ceph/super.c | 50 +++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d6e0e0421891..08b460ae0539 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 		      const char *path)
 {
 	int err;
@@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
 		}
 	}
 
-	mnt->mnt_root = root;
-	mnt->mnt_sb = fsc->sb;
-
 	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	dout("mount success\n");
-	err = 0;
+	mutex_unlock(&fsc->client->mount_mutex);
+	return root;
 
 out:
 	mutex_unlock(&fsc->client->mount_mutex);
-	return err;
+	return ERR_PTR(err);
 
 fail:
 	if (first) {
@@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb,
 	return err;
 }
 
-static int ceph_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name, void *data,
-		       struct vfsmount *mnt)
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data)
 {
 	struct super_block *sb;
 	struct ceph_fs_client *fsc;
+	struct dentry *res;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 	const char *path = NULL;
 	struct ceph_mount_options *fsopt = NULL;
 	struct ceph_options *opt = NULL;
 
-	dout("ceph_get_sb\n");
+	dout("ceph_mount\n");
 	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
-	if (err < 0)
+	if (err < 0) {
+		res = ERR_PTR(err);
 		goto out_final;
+	}
 
 	/* create client (which we may/may not use) */
 	fsc = create_fs_client(fsopt, opt);
 	if (IS_ERR(fsc)) {
-		err = PTR_ERR(fsc);
+		res = ERR_CAST(fsc);
 		kfree(fsopt);
 		kfree(opt);
 		goto out_final;
 	}
 
 	err = ceph_mdsc_init(fsc);
-	if (err < 0)
+	if (err < 0) {
+		res = ERR_PTR(err);
 		goto out;
+	}
 
 	if (ceph_test_opt(fsc->client, NOSHARE))
 		compare_super = NULL;
 	sb = sget(fs_type, compare_super, ceph_set_super, fsc);
 	if (IS_ERR(sb)) {
-		err = PTR_ERR(sb);
+		res = ERR_CAST(sb);
 		goto out;
 	}
 
@@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	} else {
 		dout("get_sb using new client %p\n", fsc);
 		err = ceph_register_bdi(sb, fsc);
-		if (err < 0)
+		if (err < 0) {
+			res = ERR_PTR(err);
 			goto out_splat;
+		}
 	}
 
-	err = ceph_mount(fsc, mnt, path);
-	if (err < 0)
+	res = ceph_real_mount(fsc, path);
+	if (IS_ERR(res))
 		goto out_splat;
-	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
-	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
-	return 0;
+	dout("root %p inode %p ino %llx.%llx\n", res,
+	     res->d_inode, ceph_vinop(res->d_inode));
+	return res;
 
 out_splat:
 	ceph_mdsc_close_sessions(fsc->mdsc);
@@ -843,8 +847,8 @@ out:
 	ceph_mdsc_destroy(fsc);
 	destroy_fs_client(fsc);
 out_final:
-	dout("ceph_get_sb fail %d\n", err);
-	return err;
+	dout("ceph_mount fail %ld\n", PTR_ERR(res));
+	return res;
 }
 
 static void ceph_kill_sb(struct super_block *s)
@@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s)
 static struct file_system_type ceph_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ceph",
-	.get_sb		= ceph_get_sb,
+	.mount		= ceph_mount,
 	.kill_sb	= ceph_kill_sb,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE,
 };
-- 
cgit v1.2.3


From 061dbc6b9010bc1a30ef9a1da5469aefa83abd7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Jul 2010 16:21:33 +0400
Subject: convert btrfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 144f8a5730f5..ebe46c628748 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -560,8 +560,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
  * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
  *	  for multiple device setup.  Make sure to keep it in sync.
  */
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data)
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
@@ -580,7 +580,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 					  &subvol_name, &subvol_objectid,
 					  &fs_devices);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 
 	error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
 	if (error)
@@ -656,11 +656,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		root = new_root;
 	}
 
-	mnt->mnt_sb = s;
-	mnt->mnt_root = root;
-
 	kfree(subvol_name);
-	return 0;
+	return root;
 
 error_s:
 	error = PTR_ERR(s);
@@ -669,7 +666,7 @@ error_close_devices:
 error_free_subvol_name:
 	kfree(subvol_name);
 error:
-	return error;
+	return ERR_PTR(error);
 }
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -746,7 +743,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
-	.get_sb		= btrfs_get_sb,
+	.mount		= btrfs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
-- 
cgit v1.2.3


From 31f43471e97eff7801251e2c3c8fc03219f85d87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 29 Oct 2010 03:38:12 -0400
Subject: convert simple cases of nfs-related ->get_sb() to ->mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/super.c | 96 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3600ec700d58..0a42e8f4adcb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -260,8 +260,8 @@ static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -277,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
-	.get_sb		= nfs_xdev_get_sb,
+	.mount		= nfs_xdev_mount,
 	.kill_sb	= nfs_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -302,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
 	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -323,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_get_sb,
+	.mount		= nfs4_remote_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -331,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_xdev_get_sb,
+	.mount		= nfs4_xdev_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -339,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_referral_get_sb,
+	.mount		= nfs4_remote_referral_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -2397,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
  * Clone an NFS2/3 server record on xdev traversal (FSID-change)
  */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			   const char *dev_name, void *raw_data,
-			   struct vfsmount *mnt)
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2411,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs_xdev_get_sb()\n");
+	dprintk("--> nfs_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2458,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	/* clone any lsm security options from the parent to the new sb */
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
 #ifdef CONFIG_NFS_V4
@@ -2649,8 +2647,9 @@ out_no_address:
 /*
  * Get the superblock for the NFS4 root partition
  */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *raw_data)
 {
 	struct nfs_parsed_mount_data *data = raw_data;
 	struct super_block *s;
@@ -2714,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
+
+	security_free_mnt_opts(&data->lsm_opts);
+	nfs_free_fhandle(mntfh);
+	return mntroot;
 
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	nfs_free_fhandle(mntfh);
-	return error;
+	return ERR_PTR(error);
 
 out_free:
 	nfs_free_server(server);
@@ -2968,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
  * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *raw_data,
-			    struct vfsmount *mnt)
+static struct dentry *
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
+		 const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2982,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs4_xdev_get_sb()\n");
+	dprintk("--> nfs4_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -3029,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs4_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data,
-		struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -3118,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
-	return 0;
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
@@ -3133,7 +3129,7 @@ out_err_noserver:
 	nfs_free_fhandle(mntfh);
 out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return error;
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
@@ -3142,7 +3138,7 @@ error_splat_bdi:
 	deactivate_locked_super(s);
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
-- 
cgit v1.2.3


From a4cdbd8bfb87ceff455aae85727077889b75001b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 29 Oct 2010 05:49:13 -0400
Subject: braino in internal.h

wrong return type...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index ebad3b90752d..e43b9a4dbf4e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,5 +106,5 @@ extern void release_open_intent(struct nameidata *);
  * inode.c
  */
 extern int get_nr_dirty_inodes(void);
-extern int evict_inodes(struct super_block *);
+extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *);
-- 
cgit v1.2.3


From 0cb59c9953171e9adf6da8142a5c85ceb77bb60d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 2 Jul 2010 12:14:14 -0400
Subject: Btrfs: write out free space cache

This is a simple bit, just dump the free space cache out to our preallocated
inode when we're writing out dirty block groups.  There are a bunch of changes
in inode.c in order to account for special cases.  Mostly when we're doing the
writeout we're holding trans_mutex, so we need to use the nolock transacation
functions.  Also we can't do asynchronous completions since the async thread
could be blocked on already completed IO waiting for the transaction lock.  This
has been tested with xfstests and btrfs filesystem balance, as well as my ENOSPC
tests.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |   1 +
 fs/btrfs/disk-io.c          |  17 ++-
 fs/btrfs/extent-tree.c      |  48 +++++++
 fs/btrfs/free-space-cache.c | 302 ++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/free-space-cache.h |   5 +
 fs/btrfs/inode.c            |  60 +++++++--
 6 files changed, 420 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 46f52e1beade..2c06b37cda75 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -982,6 +982,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
+	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45cf64fc1e3e..77e5dabfd45a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -481,9 +481,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == 1)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == 2)
+			btrfs_queue_worker(&fs_info->endio_freespace_worker,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
@@ -497,6 +500,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	}
 }
 
+/*
+ * For the metadata arg you want
+ *
+ * 0 - if data
+ * 1 - if normal metadta
+ * 2 - if writing to the free space cache area
+ */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
 {
@@ -1774,6 +1784,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+			   1, &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1794,6 +1806,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2035,6 +2048,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2468,6 +2482,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aab40fb3faed..d5455a2bf60b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2847,6 +2847,8 @@ again:
 			continue;
 		}
 
+		if (cache->disk_cache_state == BTRFS_DC_SETUP)
+			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
 		cache->dirty = 0;
 		last = cache->key.objectid + cache->key.offset;
 
@@ -2855,6 +2857,52 @@ again:
 		btrfs_put_block_group(cache);
 	}
 
+	while (1) {
+		/*
+		 * I don't think this is needed since we're just marking our
+		 * preallocated extent as written, but just in case it can't
+		 * hurt.
+		 */
+		if (last == 0) {
+			err = btrfs_run_delayed_refs(trans, root,
+						     (unsigned long)-1);
+			BUG_ON(err);
+		}
+
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			/*
+			 * Really this shouldn't happen, but it could if we
+			 * couldn't write the entire preallocated extent and
+			 * splitting the extent resulted in a new block.
+			 */
+			if (cache->dirty) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		btrfs_write_out_cache(root, trans, cache, path);
+
+		/*
+		 * If we didn't have an error then the cache state is still
+		 * NEED_WRITE, so we can set it to WRITTEN.
+		 */
+		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+			cache->disk_cache_state = BTRFS_DC_WRITTEN;
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	btrfs_free_path(path);
 	return 0;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 05efcc7061a7..7f972e59cc04 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -28,6 +28,11 @@
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
 
+static void recalculate_thresholds(struct btrfs_block_group_cache
+				   *block_group);
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+			   struct btrfs_free_space *info);
+
 struct inode *lookup_free_space_inode(struct btrfs_root *root,
 				      struct btrfs_block_group_cache
 				      *block_group, struct btrfs_path *path)
@@ -182,6 +187,303 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	return btrfs_update_inode(trans, root, inode);
 }
 
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path)
+{
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct inode *inode;
+	struct rb_node *node;
+	struct list_head *pos, *n;
+	struct page *page;
+	struct extent_state *cached_state = NULL;
+	struct list_head bitmap_list;
+	struct btrfs_key key;
+	u64 bytes = 0;
+	u32 *crc, *checksums;
+	pgoff_t index = 0, last_index = 0;
+	unsigned long first_page_offset;
+	int num_checksums;
+	int entries = 0;
+	int bitmaps = 0;
+	int ret = 0;
+
+	root = root->fs_info->tree_root;
+
+	INIT_LIST_HEAD(&bitmap_list);
+
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode))
+		return 0;
+
+	if (!i_size_read(inode)) {
+		iput(inode);
+		return 0;
+	}
+
+	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	filemap_write_and_wait(inode->i_mapping);
+	btrfs_wait_ordered_range(inode, inode->i_size &
+				 ~(root->sectorsize - 1), (u64)-1);
+
+	/* We need a checksum per page. */
+	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+	crc = checksums  = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+	if (!crc) {
+		iput(inode);
+		return 0;
+	}
+
+	/* Since the first page has all of our checksums and our generation we
+	 * need to calculate the offset into the page that we can start writing
+	 * our entries.
+	 */
+	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+
+	node = rb_first(&block_group->free_space_offset);
+	if (!node)
+		goto out_free;
+
+	/*
+	 * Lock all pages first so we can lock the extent safely.
+	 *
+	 * NOTE: Because we hold the ref the entire time we're going to write to
+	 * the page find_get_page should never fail, so we don't do a check
+	 * after find_get_page at this point.  Just putting this here so people
+	 * know and don't freak out.
+	 */
+	while (index <= last_index) {
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			pgoff_t i = 0;
+
+			while (i < index) {
+				page = find_get_page(inode->i_mapping, i);
+				unlock_page(page);
+				page_cache_release(page);
+				page_cache_release(page);
+				i++;
+			}
+			goto out_free;
+		}
+		index++;
+	}
+
+	index = 0;
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+			 0, &cached_state, GFP_NOFS);
+
+	/* Write out the extent entries */
+	do {
+		struct btrfs_free_space_entry *entry;
+		void *addr;
+		unsigned long offset = 0;
+		unsigned long start_offset = 0;
+
+		if (index == 0) {
+			start_offset = first_page_offset;
+			offset = start_offset;
+		}
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		entry = addr + start_offset;
+
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		while (1) {
+			struct btrfs_free_space *e;
+
+			e = rb_entry(node, struct btrfs_free_space, offset_index);
+			entries++;
+
+			entry->offset = cpu_to_le64(e->offset);
+			entry->bytes = cpu_to_le64(e->bytes);
+			if (e->bitmap) {
+				entry->type = BTRFS_FREE_SPACE_BITMAP;
+				list_add_tail(&e->list, &bitmap_list);
+				bitmaps++;
+			} else {
+				entry->type = BTRFS_FREE_SPACE_EXTENT;
+			}
+			node = rb_next(node);
+			if (!node)
+				break;
+			offset += sizeof(struct btrfs_free_space_entry);
+			if (offset + sizeof(struct btrfs_free_space_entry) >=
+			    PAGE_CACHE_SIZE)
+				break;
+			entry++;
+		}
+		*crc = ~(u32)0;
+		*crc = btrfs_csum_data(root, addr + start_offset, *crc,
+				       PAGE_CACHE_SIZE - start_offset);
+		kunmap(page);
+
+		btrfs_csum_final(*crc, (char *)crc);
+		crc++;
+
+		bytes += PAGE_CACHE_SIZE;
+
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+
+		/*
+		 * We need to release our reference we got for grab_cache_page,
+		 * except for the first page which will hold our checksums, we
+		 * do that below.
+		 */
+		if (index != 0) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+
+		page_cache_release(page);
+
+		index++;
+	} while (node);
+
+	/* Write out the bitmaps */
+	list_for_each_safe(pos, n, &bitmap_list) {
+		void *addr;
+		struct btrfs_free_space *entry =
+			list_entry(pos, struct btrfs_free_space, list);
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
+		*crc = ~(u32)0;
+		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
+		kunmap(page);
+		btrfs_csum_final(*crc, (char *)crc);
+		crc++;
+		bytes += PAGE_CACHE_SIZE;
+
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+		list_del_init(&entry->list);
+		index++;
+	}
+
+	/* Zero out the rest of the pages just to make sure */
+	while (index <= last_index) {
+		void *addr;
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		kunmap(page);
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+		bytes += PAGE_CACHE_SIZE;
+		index++;
+	}
+
+	btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
+
+	/* Write the checksums and trans id to the first page */
+	{
+		void *addr;
+		u64 *gen;
+
+		page = find_get_page(inode->i_mapping, 0);
+
+		addr = kmap(page);
+		memcpy(addr, checksums, sizeof(u32) * num_checksums);
+		gen = addr + (sizeof(u32) * num_checksums);
+		*gen = trans->transid;
+		kunmap(page);
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+	}
+	BTRFS_I(inode)->generation = trans->transid;
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+	filemap_write_and_wait(inode->i_mapping);
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+	if (ret < 0) {
+		ret = 0;
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+		goto out_free;
+	}
+	leaf = path->nodes[0];
+	if (ret > 0) {
+		struct btrfs_key found_key;
+		BUG_ON(!path->slots[0]);
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+		    found_key.offset != block_group->key.objectid) {
+			ret = 0;
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+					 EXTENT_DIRTY | EXTENT_DELALLOC |
+					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
+					 GFP_NOFS);
+			btrfs_release_path(root, path);
+			goto out_free;
+		}
+	}
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_set_free_space_entries(leaf, header, entries);
+	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+	btrfs_set_free_space_generation(leaf, header, trans->transid);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	ret = 1;
+
+out_free:
+	if (ret == 0) {
+		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+		spin_unlock(&block_group->lock);
+		BTRFS_I(inode)->generation = 0;
+	}
+	kfree(checksums);
+	btrfs_update_inode(trans, root, inode);
+	iput(inode);
+	return ret;
+}
+
 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
 					  u64 offset)
 {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 45be29e5f01e..189f740bd3c0 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -34,10 +34,15 @@ int create_free_space_inode(struct btrfs_root *root,
 			    struct btrfs_trans_handle *trans,
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path);
+
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    struct inode *inode);
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path);
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1af1ea88e8a8..f2fb974ed8f0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -764,6 +764,7 @@ static noinline int cow_file_range(struct inode *inode,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
+	BUG_ON(root == root->fs_info->tree_root);
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
@@ -1035,10 +1036,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	int type;
 	int nocow;
 	int check_prev = 1;
+	bool nolock = false;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	trans = btrfs_join_transaction(root, 1);
+	if (root == root->fs_info->tree_root) {
+		nolock = true;
+		trans = btrfs_join_transaction_nolock(root, 1);
+	} else {
+		trans = btrfs_join_transaction(root, 1);
+	}
 	BUG_ON(!trans);
 
 	cow_start = (u64)-1;
@@ -1211,8 +1218,13 @@ out_check:
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
+	if (nolock) {
+		ret = btrfs_end_transaction_nolock(trans, root);
+		BUG_ON(ret);
+	} else {
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+	}
 	btrfs_free_path(path);
 	return 0;
 }
@@ -1289,6 +1301,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
+		int do_list = (root->root_key.objectid !=
+			       BTRFS_ROOT_TREE_OBJECTID);
 
 		if (*bits & EXTENT_FIRST_DELALLOC)
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1312,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		root->fs_info->delalloc_bytes += len;
-		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
 		}
@@ -1321,6 +1335,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
+		int do_list = (root->root_key.objectid !=
+			       BTRFS_ROOT_TREE_OBJECTID);
 
 		if (*bits & EXTENT_FIRST_DELALLOC)
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1346,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 		if (*bits & EXTENT_DO_ACCOUNTING)
 			btrfs_delalloc_release_metadata(inode, len);
 
-		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+		    && do_list)
 			btrfs_free_reserved_data_space(inode, len);
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		root->fs_info->delalloc_bytes -= len;
 		BTRFS_I(inode)->delalloc_bytes -= len;
 
-		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
 		}
@@ -1426,7 +1443,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (root == root->fs_info->tree_root)
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
+	else
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
 	if (!(rw & REQ_WRITE)) {
@@ -1662,6 +1682,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
+	bool nolock = false;
 
 	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
 					     end - start + 1);
@@ -1669,11 +1690,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		return 0;
 	BUG_ON(!ordered_extent);
 
+	nolock = (root == root->fs_info->tree_root);
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list));
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
-			trans = btrfs_join_transaction(root, 1);
+			if (nolock)
+				trans = btrfs_join_transaction_nolock(root, 1);
+			else
+				trans = btrfs_join_transaction(root, 1);
+			BUG_ON(!trans);
 			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1713,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 ordered_extent->file_offset + ordered_extent->len - 1,
 			 0, &cached_state, GFP_NOFS);
 
-	trans = btrfs_join_transaction(root, 1);
+	if (nolock)
+		trans = btrfs_join_transaction_nolock(root, 1);
+	else
+		trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -1725,9 +1755,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
-	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans)
-		btrfs_end_transaction(trans, root);
+	if (nolock) {
+		if (trans)
+			btrfs_end_transaction_nolock(trans, root);
+	} else {
+		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+		if (trans)
+			btrfs_end_transaction(trans, root);
+	}
+
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
-- 
cgit v1.2.3


From 9d66e233c7042da27ec699453770f41e567a0442 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 25 Aug 2010 16:54:15 -0400
Subject: Btrfs: load free space cache if it exists

This patch actually loads the free space cache if it exists.  The only thing
that really changes here is that we need to cache the block group if we're going
to remove an extent from it.  Previously we did not do this since the caching
kthread would pick it up.  With the on disk cache we don't have this luxury so
we need to make sure we read the on disk cache in first, and then remove the
extent, that way when the extent is unpinned the free space is added to the
block group.  This has been tested with all sorts of things.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c      |  50 +++++++-
 fs/btrfs/free-space-cache.c | 296 ++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/free-space-cache.h |   2 +
 3 files changed, 345 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d5455a2bf60b..9a325e465ad9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -421,7 +421,9 @@ err:
 	return 0;
 }
 
-static int cache_block_group(struct btrfs_block_group_cache *cache)
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+			     struct btrfs_trans_handle *trans,
+			     int load_cache_only)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
@@ -432,6 +434,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
 	if (cache->cached != BTRFS_CACHE_NO)
 		return 0;
 
+	/*
+	 * We can't do the read from on-disk cache during a commit since we need
+	 * to have the normal tree locking.
+	 */
+	if (!trans->transaction->in_commit) {
+		spin_lock(&cache->lock);
+		if (cache->cached != BTRFS_CACHE_NO) {
+			spin_unlock(&cache->lock);
+			return 0;
+		}
+		cache->cached = BTRFS_CACHE_STARTED;
+		spin_unlock(&cache->lock);
+
+		ret = load_free_space_cache(fs_info, cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+		} else {
+			cache->cached = BTRFS_CACHE_NO;
+		}
+		spin_unlock(&cache->lock);
+		if (ret == 1)
+			return 0;
+	}
+
+	if (load_cache_only)
+		return 0;
+
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
 	BUG_ON(!caching_ctl);
 
@@ -3984,6 +4016,14 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			factor = 2;
 		else
 			factor = 1;
+		/*
+		 * If this block group has free space cache written out, we
+		 * need to make sure to load it if we are removing space.  This
+		 * is because we need the unpinning stage to actually add the
+		 * space back to the block group, otherwise we will leak space.
+		 */
+		if (!alloc && cache->cached == BTRFS_CACHE_NO)
+			cache_block_group(cache, trans, 1);
 
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
@@ -4828,6 +4868,10 @@ have_block_group:
 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
 			u64 free_percent;
 
+			ret = cache_block_group(block_group, trans, 1);
+			if (block_group->cached == BTRFS_CACHE_FINISHED)
+				goto have_block_group;
+
 			free_percent = btrfs_block_group_used(&block_group->item);
 			free_percent *= 100;
 			free_percent = div64_u64(free_percent,
@@ -4848,7 +4892,7 @@ have_block_group:
 			if (loop > LOOP_CACHING_NOWAIT ||
 			    (loop > LOOP_FIND_IDEAL &&
 			     atomic_read(&space_info->caching_threads) < 2)) {
-				ret = cache_block_group(block_group);
+				ret = cache_block_group(block_group, trans, 0);
 				BUG_ON(ret);
 			}
 			found_uncached_bg = true;
@@ -5405,7 +5449,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	u64 num_bytes = ins->offset;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	cache_block_group(block_group);
+	cache_block_group(block_group, trans, 0);
 	caching_ctl = get_caching_control(block_group);
 
 	if (!caching_ctl) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7f972e59cc04..baa193423fb8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -187,6 +187,302 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	return btrfs_update_inode(trans, root, inode);
 }
 
+static int readahead_cache(struct inode *inode)
+{
+	struct file_ra_state *ra;
+	unsigned long last_index;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	file_ra_state_init(ra, inode->i_mapping);
+	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
+
+	kfree(ra);
+
+	return 0;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_root *root = fs_info->tree_root;
+	struct inode *inode;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct page *page;
+	struct btrfs_path *path;
+	u32 *checksums = NULL, *crc;
+	char *disk_crcs = NULL;
+	struct btrfs_key key;
+	struct list_head bitmaps;
+	u64 num_entries;
+	u64 num_bitmaps;
+	u64 generation;
+	u32 cur_crc = ~(u32)0;
+	pgoff_t index = 0;
+	unsigned long first_page_offset;
+	int num_checksums;
+	int ret = 0;
+
+	/*
+	 * If we're unmounting then just return, since this does a search on the
+	 * normal root and not the commit root and we could deadlock.
+	 */
+	smp_mb();
+	if (fs_info->closing)
+		return 0;
+
+	/*
+	 * If this block group has been marked to be cleared for one reason or
+	 * another then we can't trust the on disk cache, so just return.
+	 */
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+		printk(KERN_ERR "not reading block group %llu, dcs is %d\n", block_group->key.objectid,
+		       block_group->disk_cache_state);
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	INIT_LIST_HEAD(&bitmaps);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return 0;
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode)) {
+		btrfs_free_path(path);
+		return 0;
+	}
+
+	/* Nothing in the space cache, goodbye */
+	if (!i_size_read(inode)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	num_entries = btrfs_free_space_entries(leaf, header);
+	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+	generation = btrfs_free_space_generation(leaf, header);
+	btrfs_free_path(path);
+
+	if (BTRFS_I(inode)->generation != generation) {
+		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
+		       " not match free space cache generation (%llu) for "
+		       "block group %llu\n",
+		       (unsigned long long)BTRFS_I(inode)->generation,
+		       (unsigned long long)generation,
+		       (unsigned long long)block_group->key.objectid);
+		goto out;
+	}
+
+	if (!num_entries)
+		goto out;
+
+	/* Setup everything for doing checksumming */
+	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+	if (!checksums)
+		goto out;
+	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
+	if (!disk_crcs)
+		goto out;
+
+	ret = readahead_cache(inode);
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	while (1) {
+		struct btrfs_free_space_entry *entry;
+		struct btrfs_free_space *e;
+		void *addr;
+		unsigned long offset = 0;
+		unsigned long start_offset = 0;
+		int need_loop = 0;
+
+		if (!num_entries && !num_bitmaps)
+			break;
+
+		if (index == 0) {
+			start_offset = first_page_offset;
+			offset = start_offset;
+		}
+
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			ret = 0;
+			goto free_cache;
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				printk(KERN_ERR "btrfs: error reading free "
+				       "space cache: %llu\n",
+				       (unsigned long long)
+				       block_group->key.objectid);
+				goto free_cache;
+			}
+		}
+		addr = kmap(page);
+
+		if (index == 0) {
+			u64 *gen;
+
+			memcpy(disk_crcs, addr, first_page_offset);
+			gen = addr + (sizeof(u32) * num_checksums);
+			if (*gen != BTRFS_I(inode)->generation) {
+				printk(KERN_ERR "btrfs: space cache generation"
+				       " (%llu) does not match inode (%llu) "
+				       "for block group %llu\n",
+				       (unsigned long long)*gen,
+				       (unsigned long long)
+				       BTRFS_I(inode)->generation,
+				       (unsigned long long)
+				       block_group->key.objectid);
+				kunmap(page);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+			crc = (u32 *)disk_crcs;
+		}
+		entry = addr + start_offset;
+
+		/* First lets check our crc before we do anything fun */
+		cur_crc = ~(u32)0;
+		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
+					  PAGE_CACHE_SIZE - start_offset);
+		btrfs_csum_final(cur_crc, (char *)&cur_crc);
+		if (cur_crc != *crc) {
+			printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
+			       "block group %llu\n", index,
+			       (unsigned long long)block_group->key.objectid);
+			kunmap(page);
+			unlock_page(page);
+			page_cache_release(page);
+			goto free_cache;
+		}
+		crc++;
+
+		while (1) {
+			if (!num_entries)
+				break;
+
+			need_loop = 1;
+			e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+			if (!e) {
+				kunmap(page);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+
+			e->offset = le64_to_cpu(entry->offset);
+			e->bytes = le64_to_cpu(entry->bytes);
+			if (!e->bytes) {
+				kunmap(page);
+				kfree(e);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+
+			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
+				spin_lock(&block_group->tree_lock);
+				ret = link_free_space(block_group, e);
+				spin_unlock(&block_group->tree_lock);
+				BUG_ON(ret);
+			} else {
+				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+				if (!e->bitmap) {
+					kunmap(page);
+					kfree(e);
+					unlock_page(page);
+					page_cache_release(page);
+					goto free_cache;
+				}
+				spin_lock(&block_group->tree_lock);
+				ret = link_free_space(block_group, e);
+				block_group->total_bitmaps++;
+				recalculate_thresholds(block_group);
+				spin_unlock(&block_group->tree_lock);
+				list_add_tail(&e->list, &bitmaps);
+			}
+
+			num_entries--;
+			offset += sizeof(struct btrfs_free_space_entry);
+			if (offset + sizeof(struct btrfs_free_space_entry) >=
+			    PAGE_CACHE_SIZE)
+				break;
+			entry++;
+		}
+
+		/*
+		 * We read an entry out of this page, we need to move on to the
+		 * next page.
+		 */
+		if (need_loop) {
+			kunmap(page);
+			goto next;
+		}
+
+		/*
+		 * We add the bitmaps at the end of the entries in order that
+		 * the bitmap entries are added to the cache.
+		 */
+		e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+		list_del_init(&e->list);
+		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
+		kunmap(page);
+		num_bitmaps--;
+next:
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+
+	ret = 1;
+out:
+	kfree(checksums);
+	kfree(disk_crcs);
+	iput(inode);
+	return ret;
+
+free_cache:
+	/* This cache is bogus, make sure it gets cleared */
+	spin_lock(&block_group->lock);
+	block_group->disk_cache_state = BTRFS_DC_CLEAR;
+	spin_unlock(&block_group->lock);
+	btrfs_remove_free_space_cache(block_group);
+	goto out;
+}
+
 int btrfs_write_out_cache(struct btrfs_root *root,
 			  struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 189f740bd3c0..e49ca5c321b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -39,6 +39,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group);
 int btrfs_write_out_cache(struct btrfs_root *root,
 			  struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
-- 
cgit v1.2.3


From dde5abee12327d59f968bbfc8151e1b04082a2c4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 16 Sep 2010 16:17:03 -0400
Subject: Btrfs: check cache->caching_ctl before returning if caching has
 started

With the free space disk caching we can mark the block group as started with the
caching, but we don't have a caching ctl.  This can race with anybody else who
tries to get the caching ctl before we cache (this is very hard to do btw).  So
instead check to see if cache->caching_ctl is set, and if not return NULL.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9a325e465ad9..5c9ef3ac25e1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
 		return NULL;
 	}
 
+	/* We're loading it the fast way, so we don't have a caching_ctl. */
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
 	ctl = cache->caching_ctl;
 	atomic_inc(&ctl->count);
 	spin_unlock(&cache->lock);
-- 
cgit v1.2.3


From 67377734fd24c32cbdfeb697c2e2bd7fed519e75 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 16 Sep 2010 16:19:09 -0400
Subject: Btrfs: add support for mixed data+metadata block groups

There are just a few things that need to be fixed in the kernel to support mixed
data+metadata block groups.  Mostly we just need to make sure that if we are
using mixed block groups that we continue to allocate mixed block groups as we
need them.  Also we need to make sure __find_space_info will find our space info
if we search for DATA or METADATA only.  Tested this with xfstests and it works
nicely.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       | 10 +++++++++-
 fs/btrfs/extent-tree.c | 22 +++++++++++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c06b37cda75..b155a0e49eeb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -397,12 +397,14 @@ struct btrfs_super_block {
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
-	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -2046,6 +2048,12 @@ static inline struct dentry *fdentry(struct file *file)
 	return file->f_path.dentry;
 }
 
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
 /* extent-tree.c */
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5c9ef3ac25e1..137833e1fc26 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -547,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags == flags) {
+		if (found->flags & flags) {
 			rcu_read_unlock();
 			return found;
 		}
@@ -3266,6 +3266,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&space_info->lock);
 
+	/*
+	 * If we have mixed data/metadata chunks we want to make sure we keep
+	 * allocating mixed chunks instead of individual chunks.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
 	/*
 	 * if we're doing a data chunk, go ahead and make sure that
 	 * we keep a reasonable number of metadata chunks allocated in the
@@ -4787,6 +4794,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
+	bool use_cluster = true;
 	u64 ideal_cache_percent = 0;
 	u64 ideal_cache_offset = 0;
 
@@ -4801,16 +4809,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 	}
 
+	/*
+	 * If the space info is for both data and metadata it means we have a
+	 * small filesystem and we can't use the clustering stuff.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		use_cluster = false;
+
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
 
-	if (data & BTRFS_BLOCK_GROUP_METADATA) {
+	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+	    btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->data_alloc_cluster;
 	}
 
-- 
cgit v1.2.3


From 88c2ba3b069f1e0f4694124d02985fa7620a19f1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 21 Sep 2010 14:21:34 -0400
Subject: Btrfs: Add a clear_cache mount option

If something goes wrong with the free space cache we need a way to make sure
it's not loaded on mount and that it's cleared for everybody.  When you pass the
clear_cache option it will make it so all block groups are setup to be cleared,
which keeps them from being loaded and then they will be truncated when the
transaction is committed.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            | 1 +
 fs/btrfs/extent-tree.c      | 2 ++
 fs/btrfs/free-space-cache.c | 2 --
 fs/btrfs/super.c            | 6 +++++-
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b155a0e49eeb..633e559e000e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1229,6 +1229,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_DISCARD		(1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
 #define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 137833e1fc26..1a94ee4c4fbb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8198,6 +8198,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (cache_gen != 0 &&
 	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
 		need_clear = 1;
+	if (btrfs_test_opt(root, CLEAR_CACHE))
+		need_clear = 1;
 
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index baa193423fb8..22ee0dc2e6b8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -242,8 +242,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 	 */
 	spin_lock(&block_group->lock);
 	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
-		printk(KERN_ERR "not reading block group %llu, dcs is %d\n", block_group->key.objectid,
-		       block_group->disk_cache_state);
 		spin_unlock(&block_group->lock);
 		return 0;
 	}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5c23eb8d6ba3..5f56213908e7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ enum {
 	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-	Opt_discard, Opt_space_cache, Opt_err,
+	Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -93,6 +93,7 @@ static match_table_t tokens = {
 	{Opt_ratio, "metadata_ratio=%d"},
 	{Opt_discard, "discard"},
 	{Opt_space_cache, "space_cache"},
+	{Opt_clear_cache, "clear_cache"},
 	{Opt_err, NULL},
 };
 
@@ -239,6 +240,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_space_cache:
 			printk(KERN_INFO "btrfs: enabling disk space caching\n");
 			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+		case Opt_clear_cache:
+			printk(KERN_INFO "btrfs: force clearing of disk cache\n");
+			btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
 			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
-- 
cgit v1.2.3


From 8216ef866df1119fd5a72372b8b29bce49c18590 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 28 Oct 2010 16:55:47 -0400
Subject: Btrfs: let the user know space caching is enabled

If you mount -o space_cache, the option will be persistent across mounts, but to
make sure the user knows that they did this, emit a message telling them if they
didn't mount with -o space_cache but the feature is still used.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1a94ee4c4fbb..d2a7ff53e99a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8200,6 +8200,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		need_clear = 1;
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		need_clear = 1;
+	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+		printk(KERN_INFO "btrfs: disk space caching is enabled\n");
 
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
-- 
cgit v1.2.3


From cb44921a09221f0a90217b44044448f63190f3e5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 24 Oct 2010 11:01:27 -0400
Subject: Btrfs: don't loop forever on bad btree blocks

When btrfs discovers the generation number in a btree block is
incorrect, it can loop forever without forcing the RAID
code to try a valid mirror, and without returning EIO.

This changes things to properly kick out the EIO.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc2..6921231e0efb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1577,13 +1577,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocksize = btrfs_level_size(root, level - 1);
 
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-		/*
-		 * we found an up to date block without sleeping, return
-		 * right away
-		 */
-		*eb_ret = tmp;
-		return 0;
+	if (tmp) {
+		if (btrfs_buffer_uptodate(tmp, 0)) {
+			if (btrfs_buffer_uptodate(tmp, gen)) {
+				/*
+				 * we found an up to date block without
+				 * sleeping, return
+				 * right away
+				 */
+				*eb_ret = tmp;
+				return 0;
+			}
+			/* the pages were up to date, but we failed
+			 * the generation number check.  Do a full
+			 * read for the generation number that is correct.
+			 * We must do this without dropping locks so
+			 * we can trust our generation number
+			 */
+			free_extent_buffer(tmp);
+			tmp = read_tree_block(root, blocknr, blocksize, gen);
+			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+				*eb_ret = tmp;
+				return 0;
+			}
+			free_extent_buffer(tmp);
+			btrfs_release_path(NULL, p);
+			return -EIO;
+		}
 	}
 
 	/*
@@ -1596,8 +1616,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_unlock_up_safe(p, level + 1);
 	btrfs_set_path_blocking(p);
 
-	if (tmp)
-		free_extent_buffer(tmp);
+	free_extent_buffer(tmp);
 	if (p->reada)
 		reada_for_search(root, p, level, slot, key->objectid);
 
-- 
cgit v1.2.3


From 3259f8bed2f0f57c2fdcdac1b510c3fa319ef97e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 29 Oct 2010 11:16:17 -0400
Subject: Add new functions for triggering inode writeback

When btrfs is running low on metadata space, it needs to force delayed
allocation pages to disk.  It currently does this with a suboptimal walk
of a private list of inodes with delayed allocation, and it would be
much better if we used the generic flusher threads.

writeback_inodes_sb_if_idle would be ideal, but it waits for the flusher
thread to start IO on all the dirty pages in the FS before it returns.
This adds variants of writeback_inodes_sb* that allow the caller to
control how many pages get sent down.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/fs-writeback.c         | 52 ++++++++++++++++++++++++++++++++++++++---------
 include/linux/writeback.h |  2 ++
 2 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..1e23c33ea5cf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1069,33 +1069,44 @@ static void wait_sb_inodes(struct super_block *sb)
 }
 
 /**
- * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
  * @sb: the superblock
+ * @nr: the number of pages to write
  *
  * Start writeback on some inodes on this super_block. No guarantees are made
  * on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO. The number of pages submitted is
- * returned.
+ * for IO completion of submitted IO.
  */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.done		= &done,
+		.nr_pages	= nr,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
-	work.nr_pages = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+	return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) +
+			      global_page_state(NR_UNSTABLE_NFS) +
+			      (inodes_stat.nr_inodes - inodes_stat.nr_unused));
+}
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
@@ -1117,6 +1128,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
 }
 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 
+/**
+ * writeback_inodes_sb_if_idle	-	start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+				   unsigned long nr)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
+		writeback_inodes_sb_nr(sb, nr);
+		up_read(&sb->s_umount);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+
 /**
  * sync_inodes_sb	-	sync sb inode pages
  * @sb: the superblock
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 72a5d647a5f2..a4cf84511e79 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -60,7 +60,9 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
 int writeback_inodes_sb_if_idle(struct super_block *);
+int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct writeback_control *wbc);
-- 
cgit v1.2.3


From e5bc2458293b2af6c0b94435965c68cc70974b56 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 26 Oct 2010 13:37:56 -0400
Subject: Btrfs: tune the chunk allocation to 5% of the FS as metadata

An earlier commit tried to keep us from allocating too many
empty metadata chunks.  It was somewhat too restrictive and could
lead to ENOSPC errors on empty filesystems.

This increases the limits to about 5% of the FS size, allowing more
metadata chunks to be preallocated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 372fd224a11d..980d6a3c342c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -580,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
+static u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor == 100)
+		return num;
+	num *= factor;
+	do_div(num, 100);
+	return num;
+}
+
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner)
 {
@@ -3218,9 +3227,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_root *root,
+			      struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+	u64 thresh;
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved +
 	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3230,8 +3241,10 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes)
 	    alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
-	if (num_bytes > 256 * 1024 * 1024 &&
-	    sinfo->bytes_used < div_factor(num_bytes, 3))
+	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+
+	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
 		return 0;
 
 	return 1;
@@ -3265,7 +3278,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+	if (!force && !should_alloc_chunk(extent_root, space_info,
+					  alloc_bytes)) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
-- 
cgit v1.2.3


From bf9022e06af553553bc8f4e21ce36147ca6eae68 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 26 Oct 2010 13:40:45 -0400
Subject: Btrfs: use the flusher threads for delalloc throttling

We have a fairly complex set of loops around walking our list of
delalloc inodes when we find metadata delalloc space running low.
It doesn't work very well, can use large amounts of CPU and doesn't
do very efficient writeback.

This switches us to kick the bdi flusher threads instead.  All dirty
data in btrfs is accounted as delalloc data, so this is very similar
in terms of what it writes, but we're able to just kick off the IO
and wait for progress.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 980d6a3c342c..59c8daaacf0c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3328,15 +3328,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
-	int no_reclaim = 0;
 	int pause = 1;
-	int ret;
+	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
-	spin_lock(&space_info->lock);
+
+	smp_mb();
 	reserved = space_info->bytes_reserved;
-	spin_unlock(&space_info->lock);
 
 	if (reserved == 0)
 		return 0;
@@ -3344,20 +3343,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (1) {
-		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync);
-		if (!ret) {
-			if (no_reclaim > 2)
-				break;
-			no_reclaim++;
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(pause);
-			pause <<= 1;
-			if (pause > HZ / 10)
-				pause = HZ / 10;
-		} else {
-			no_reclaim = 0;
-			pause = 1;
-		}
+		/* have the flusher threads jump in and do some IO */
+		smp_mb();
+		nr_pages = min_t(unsigned long, nr_pages,
+		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
 		if (reserved > space_info->bytes_reserved)
@@ -3370,6 +3360,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(pause);
+		pause <<= 1;
+		if (pause > HZ / 10)
+			pause = HZ / 10;
+
 	}
 	return reclaimed >= to_reclaim;
 }
-- 
cgit v1.2.3


From 897ca6e9b4fef86d5dfb6b31fd9f592ce6a47a42 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 26 Oct 2010 20:57:29 -0400
Subject: Btrfs: restructure try_release_extent_buffer()

restructure try_release_extent_buffer() and write a function to release the
extent buffer. It will be used later.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53a..6e3b326346a7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3104,6 +3104,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+						unsigned long start_idx)
+{
+	unsigned long index;
+	struct page *page;
+
+	if (!eb->first_page)
+		return;
+
+	index = num_extent_pages(eb->start, eb->len);
+	if (start_idx >= index)
+		return;
+
+	do {
+		index--;
+		page = extent_buffer_page(eb, index);
+		if (page)
+			page_cache_release(page);
+	} while (index != start_idx);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+	btrfs_release_extent_buffer_page(eb, 0);
+	__free_extent_buffer(eb);
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 					  u64 start, unsigned long len,
 					  struct page *page0,
@@ -3181,10 +3214,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 free_eb:
 	if (!atomic_dec_and_test(&eb->refs))
 		return exists;
-	for (index = 1; index < i; index++)
-		page_cache_release(extent_buffer_page(eb, index));
-	page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
+	btrfs_release_extent_buffer(eb);
 	return exists;
 }
 
@@ -3838,8 +3868,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 	u64 start = page_offset(page);
 	struct extent_buffer *eb;
 	int ret = 1;
-	unsigned long i;
-	unsigned long num_pages;
 
 	spin_lock(&tree->buffer_lock);
 	eb = buffer_search(tree, start);
@@ -3854,12 +3882,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 		ret = 0;
 		goto out;
 	}
-	/* at this point we can safely release the extent buffer */
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++)
-		page_cache_release(extent_buffer_page(eb, i));
+
 	rb_erase(&eb->rb_node, &tree->buffer);
-	__free_extent_buffer(eb);
+	/* at this point we can safely release the extent buffer */
+	btrfs_release_extent_buffer(eb);
 out:
 	spin_unlock(&tree->buffer_lock);
 	return ret;
-- 
cgit v1.2.3


From 19fe0a8b787d7c7f9318975b5a8c6e7e5e54e925 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 26 Oct 2010 20:57:29 -0400
Subject: Btrfs: Switch the extent buffer rbtree into a radix tree

This patch reduces the CPU time spent in the extent buffer search by using the
radix tree instead of the rbtree and using the rcu lock instead of the spin
lock.

I did a quick test by the benchmark tool[1] and found the patch improve the
file creation/deletion performance problem that I have reported[2].

Before applying this patch:
Create files:
	Total files: 50000
	Total time: 0.971531
	Average time: 0.000019
Delete files:
	Total files: 50000
	Total time: 1.366761
	Average time: 0.000027

After applying this patch:
Create files:
	Total files: 50000
	Total time: 0.927455
	Average time: 0.000019
Delete files:
	Total files: 50000
	Total time: 1.292280
	Average time: 0.000026

[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
[2] http://marc.info/?l=linux-btrfs&m=128212635122920&w=2

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 114 +++++++++++++++++++++------------------------------
 fs/btrfs/extent_io.h |   4 +-
 2 files changed, 49 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6e3b326346a7..4c639e156296 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
 	tree->state = RB_ROOT;
-	tree->buffer = RB_ROOT;
+	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	return ret;
 }
 
-static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
-					  u64 offset, struct rb_node *node)
-{
-	struct rb_root *root = &tree->buffer;
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct extent_buffer *eb;
-
-	while (*p) {
-		parent = *p;
-		eb = rb_entry(parent, struct extent_buffer, rb_node);
-
-		if (offset < eb->start)
-			p = &(*p)->rb_left;
-		else if (offset > eb->start)
-			p = &(*p)->rb_right;
-		else
-			return eb;
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-	return NULL;
-}
-
-static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
-					   u64 offset)
-{
-	struct rb_root *root = &tree->buffer;
-	struct rb_node *n = root->rb_node;
-	struct extent_buffer *eb;
-
-	while (n) {
-		eb = rb_entry(n, struct extent_buffer, rb_node);
-		if (offset < eb->start)
-			n = n->rb_left;
-		else if (offset > eb->start)
-			n = n->rb_right;
-		else
-			return eb;
-	}
-	return NULL;
-}
-
 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 		     struct extent_state *other)
 {
@@ -3082,6 +3038,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb->len = len;
 	spin_lock_init(&eb->lock);
 	init_waitqueue_head(&eb->lock_wq);
+	INIT_RCU_HEAD(&eb->rcu_head);
 
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
@@ -3150,16 +3107,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 1;
+	int ret;
 
-	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
-	if (eb) {
-		atomic_inc(&eb->refs);
-		spin_unlock(&tree->buffer_lock);
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
 		mark_page_accessed(eb->first_page);
 		return eb;
 	}
-	spin_unlock(&tree->buffer_lock);
+	rcu_read_unlock();
 
 	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb)
@@ -3198,17 +3155,25 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto free_eb;
+
 	spin_lock(&tree->buffer_lock);
-	exists = buffer_tree_insert(tree, start, &eb->rb_node);
-	if (exists) {
+	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+	if (ret == -EEXIST) {
+		exists = radix_tree_lookup(&tree->buffer,
+						start >> PAGE_CACHE_SHIFT);
 		/* add one reference for the caller */
 		atomic_inc(&exists->refs);
 		spin_unlock(&tree->buffer_lock);
+		radix_tree_preload_end();
 		goto free_eb;
 	}
 	/* add one reference for the tree */
 	atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
+	radix_tree_preload_end();
 	return eb;
 
 free_eb:
@@ -3224,16 +3189,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 {
 	struct extent_buffer *eb;
 
-	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
-	if (eb)
-		atomic_inc(&eb->refs);
-	spin_unlock(&tree->buffer_lock);
-
-	if (eb)
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
 		mark_page_accessed(eb->first_page);
+		return eb;
+	}
+	rcu_read_unlock();
 
-	return eb;
+	return NULL;
 }
 
 void free_extent_buffer(struct extent_buffer *eb)
@@ -3863,6 +3828,14 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	}
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+	struct extent_buffer *eb =
+			container_of(head, struct extent_buffer, rcu_head);
+
+	btrfs_release_extent_buffer(eb);
+}
+
 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 {
 	u64 start = page_offset(page);
@@ -3870,23 +3843,30 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 	int ret = 1;
 
 	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (!eb)
 		goto out;
 
-	if (atomic_read(&eb->refs) > 1) {
+	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
 		ret = 0;
 		goto out;
 	}
-	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+
+	/*
+	 * set @eb->refs to 0 if it is already 1, and then release the @eb.
+	 * Or go back.
+	 */
+	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
 		ret = 0;
 		goto out;
 	}
 
-	rb_erase(&eb->rb_node, &tree->buffer);
-	/* at this point we can safely release the extent buffer */
-	btrfs_release_extent_buffer(eb);
+	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 out:
 	spin_unlock(&tree->buffer_lock);
+
+	/* at this point we can safely release the extent buffer */
+	if (atomic_read(&eb->refs) == 0)
+		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590da..1c6d4f342ef7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
 
 struct extent_io_tree {
 	struct rb_root state;
-	struct rb_root buffer;
+	struct radix_tree_root buffer;
 	struct address_space *mapping;
 	u64 dirty_bytes;
 	spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
 	unsigned long bflags;
 	atomic_t refs;
 	struct list_head leak_list;
-	struct rb_node rb_node;
+	struct rcu_head rcu_head;
 
 	/* the spinlock is used to protect most operations */
 	spinlock_t lock;
-- 
cgit v1.2.3


From 18e503d695ff8ff9a43768555aa74575bf6b77f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Oct 2010 15:30:42 -0400
Subject: Btrfs: fix raid code for removing missing drives

When btrfs is mounted in degraded mode, it has some internal structures
to track the missing devices.  This missing device is setup as readonly,
but the mapping code can get upset when we try to write to it.

This changes the mapping code to return -EIO instead of oops when we try
to write to the readonly device.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..28681e729b1d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3034,8 +3034,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
-		BUG_ON(rw == WRITE && !dev->writeable);
-		if (dev && dev->bdev) {
+		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
-- 
cgit v1.2.3


From 48b512e6857139393cdfce26348c362b87537018 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Tue, 5 Oct 2010 18:53:45 +0200
Subject: ecryptfs: call vfs_setxattr() in ecryptfs_setxattr()

Ecryptfs is a stackable filesystem which relies on lower filesystems the
ability of setting/getting extended attributes.

If there is a security module enabled on the system it updates the
'security' field of inodes according to the owned extended attribute set
with the function vfs_setxattr().  When this function is performed on a
ecryptfs filesystem the 'security' field is not updated for the lower
filesystem since the call security_inode_post_setxattr() is missing for
the lower inode.
Further, the call security_inode_setxattr() is missing for the lower inode,
leading to policy violations in the security module because specific
checks for this hook are not performed (i. e. filesystem
'associate' permission on SELinux is not checked for the lower filesystem).

This patch replaces the call of the setxattr() method of the lower inode
in the function ecryptfs_setxattr() with vfs_setxattr().

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Cc: stable <stable@kernel.org>
Cc: Dustin Kirkland <kirkland@canonical.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..63e6ec0e8b50 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -1108,10 +1109,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	mutex_lock(&lower_dentry->d_inode->i_mutex);
-	rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
-						   size, flags);
-	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+
+	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 2e21b3f124eceb6ab5a07c8a061adce14ac94e14 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 23 Sep 2010 02:35:04 -0500
Subject: eCryptfs: Clear LOOKUP_OPEN flag when creating lower file

eCryptfs was passing the LOOKUP_OPEN flag through to the lower file
system, even though ecryptfs_create() doesn't support the flag. A valid
filp for the lower filesystem could be returned in the nameidata if the
lower file system's create() function supported LOOKUP_OPEN, possibly
resulting in unencrypted writes to the lower file.

However, this is only a potential problem in filesystems (FUSE, NFS,
CIFS, CEPH, 9p) that eCryptfs isn't known to support today.

https://bugs.launchpad.net/ecryptfs/+bug/641703

Reported-by: Kevin Buhr
Cc: stable <stable@kernel.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 63e6ec0e8b50..9d1a22d62765 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -71,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
 	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	struct dentry *dentry_save;
 	struct vfsmount *vfsmount_save;
+	unsigned int flags_save;
 	int rc;
 
 	dentry_save = nd->path.dentry;
 	vfsmount_save = nd->path.mnt;
+	flags_save = nd->flags;
 	nd->path.dentry = lower_dentry;
 	nd->path.mnt = lower_mnt;
+	nd->flags &= ~LOOKUP_OPEN;
 	rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
 	nd->path.dentry = dentry_save;
 	nd->path.mnt = vfsmount_save;
+	nd->flags = flags_save;
 	return rc;
 }
 
-- 
cgit v1.2.3


From aee683b9e77e17237b0e146025c3d363c9203634 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 6 Oct 2010 18:31:06 +0200
Subject: ecryptfs: release keys loaded in ecryptfs_keyring_auth_tok_for_sig()

This patch allows keys requested in the function
ecryptfs_keyring_auth_tok_for_sig()to be released when they are no
longer required. In particular keys are directly released in the same
function if the obtained authentication token is not valid.

Further, a new function parameter 'auth_tok_key' has been added to
ecryptfs_find_auth_tok_for_sig() in order to provide callers the key
pointer to be passed to key_put().

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: James Morris <jmorris@namei.org>
[Tyler: Initialize auth_tok_key to NULL in ecryptfs_parse_packet_set]
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/keystore.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..b85c6a7770a8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
  */
 static int
 ecryptfs_find_auth_tok_for_sig(
+	struct key **auth_tok_key,
 	struct ecryptfs_auth_tok **auth_tok,
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 	char *sig)
@@ -453,12 +454,12 @@ ecryptfs_find_auth_tok_for_sig(
 	struct ecryptfs_global_auth_tok *global_auth_tok;
 	int rc = 0;
 
+	(*auth_tok_key) = NULL;
 	(*auth_tok) = NULL;
 	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
 						  mount_crypt_stat, sig)) {
-		struct key *auth_tok_key;
 
-		rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+		rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
 						       sig);
 	} else
 		(*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +510,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 			     char *filename, size_t filename_size)
 {
 	struct ecryptfs_write_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +608,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	}
 	dest[s->i++] = s->cipher_code;
 	rc = ecryptfs_find_auth_tok_for_sig(
+		&auth_tok_key,
 		&s->auth_tok, mount_crypt_stat,
 		mount_crypt_stat->global_default_fnek_sig);
 	if (rc) {
@@ -753,6 +756,8 @@ out_free_unlock:
 out_unlock:
 	mutex_unlock(s->tfm_mutex);
 out:
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	kfree(s);
 	return rc;
 }
@@ -798,6 +803,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 			     char *data, size_t max_packet_size)
 {
 	struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	(*packet_size) = 0;
@@ -910,7 +916,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	 * >= ECRYPTFS_MAX_IV_BYTES. */
 	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
 	s->desc.info = s->iv;
-	rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+	rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					    &s->auth_tok, mount_crypt_stat,
 					    s->fnek_sig_hex);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +993,8 @@ out:
 		(*filename_size) = 0;
 		(*filename) = NULL;
 	}
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	kfree(s);
 	return rc;
 }
@@ -1557,14 +1566,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
 		       ECRYPTFS_VERSION_MAJOR,
 		       ECRYPTFS_VERSION_MINOR);
 		rc = -EINVAL;
-		goto out;
+		goto out_release_key;
 	}
 	if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
 	    && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
 		printk(KERN_ERR "Invalid auth_tok structure "
 		       "returned from key query\n");
 		rc = -EINVAL;
-		goto out;
+		goto out_release_key;
+	}
+out_release_key:
+	if (rc) {
+		key_put(*auth_tok_key);
+		(*auth_tok_key) = NULL;
 	}
 out:
 	return rc;
@@ -1688,6 +1702,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
 	size_t tag_11_contents_size;
 	size_t tag_11_packet_size;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	INIT_LIST_HEAD(&auth_tok_list);
@@ -1784,6 +1799,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 	 * just one will be sufficient to decrypt to get the FEK. */
 find_next_matching_auth_tok:
 	found_auth_tok = 0;
+	if (auth_tok_key) {
+		key_put(auth_tok_key);
+		auth_tok_key = NULL;
+	}
 	list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
 		candidate_auth_tok = &auth_tok_list_item->auth_tok;
 		if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,7 +1819,8 @@ find_next_matching_auth_tok:
 			rc = -EINVAL;
 			goto out_wipe_list;
 		}
-		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+		ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					       &matching_auth_tok,
 					       crypt_stat->mount_crypt_stat,
 					       candidate_auth_tok_sig);
 		if (matching_auth_tok) {
@@ -1866,6 +1886,8 @@ found_matching_auth_tok:
 out_wipe_list:
 	wipe_auth_tok_list(&auth_tok_list);
 out:
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	return rc;
 }
 
-- 
cgit v1.2.3


From 39fac853a758306285404368fbe392408057b136 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 6 Oct 2010 18:31:15 +0200
Subject: ecryptfs: checking return code of ecryptfs_find_auth_tok_for_sig()

This patch replaces the check of the 'matching_auth_tok' pointer with
the exit status of ecryptfs_find_auth_tok_for_sig().
This avoids to use authentication tokens obtained through the function
ecryptfs_keyring_auth_tok_for_sig which are not valid.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/keystore.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b85c6a7770a8..e7f029f00c6b 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1819,11 +1819,11 @@ find_next_matching_auth_tok:
 			rc = -EINVAL;
 			goto out_wipe_list;
 		}
-		ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+		rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
 					       &matching_auth_tok,
 					       crypt_stat->mount_crypt_stat,
 					       candidate_auth_tok_sig);
-		if (matching_auth_tok) {
+		if (!rc) {
 			found_auth_tok = 1;
 			goto found_matching_auth_tok;
 		}
-- 
cgit v1.2.3


From f16feb5119a87f5e683be7e8916c060abfb0e8d6 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@polito.it>
Date: Wed, 6 Oct 2010 18:31:32 +0200
Subject: ecryptfs: added ecryptfs_mount_auth_tok_only mount parameter

This patch adds a new mount parameter 'ecryptfs_mount_auth_tok_only' to
force ecryptfs to use only authentication tokens which signature has
been specified at mount time with parameters 'ecryptfs_sig' and
'ecryptfs_fnek_sig'. In this way, after disabling the passthrough and
the encrypted view modes, it's possible to make available to users only
files encrypted with the specified authentication token.

Signed-off-by: Roberto Sassu <roberto.sassu@polito.it>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: James Morris <jmorris@namei.org>
[Tyler: Clean up coding style errors found by checkpatch]
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/ecryptfs_kernel.h | 1 +
 fs/ecryptfs/keystore.c        | 9 +++++++++
 fs/ecryptfs/main.c            | 8 +++++++-
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..59ab793fc01b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
+#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
 	u32 flags;
 	struct list_head global_auth_tok_list;
 	struct mutex global_auth_tok_list_mutex;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e7f029f00c6b..b1f6858a5223 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -459,6 +459,15 @@ ecryptfs_find_auth_tok_for_sig(
 	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
 						  mount_crypt_stat, sig)) {
 
+		/* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
+		 * mount_crypt_stat structure, we prevent to use auth toks that
+		 * are not inserted through the ecryptfs_add_global_auth_tok
+		 * function.
+		 */
+		if (mount_crypt_stat->flags
+				& ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+			return -EINVAL;
+
 		rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
 						       sig);
 	} else
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..09eb5296aebb 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
        ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
        ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
+       ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
 	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
 	{ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
+	{ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
 		case ecryptfs_opt_unlink_sigs:
 			mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
 			break;
+		case ecryptfs_opt_mount_auth_tok_only:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
+			break;
 		case ecryptfs_opt_err:
 		default:
 			printk(KERN_WARNING
-- 
cgit v1.2.3


From 8747f954817212b4623f9067d4909cbde04b4d89 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Fri, 15 Oct 2010 16:43:41 -0500
Subject: eCryptfs: Print mount_auth_tok_only param in ecryptfs_show_options

When printing mount options, print the new ecryptfs_mount_auth_tok_only
mount option.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..253732382d37 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -180,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, ",ecryptfs_encrypted_view");
 	if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
 		seq_printf(m, ",ecryptfs_unlink_sigs");
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+		seq_printf(m, ",ecryptfs_mount_auth_tok_only");
 
 	return 0;
 }
-- 
cgit v1.2.3


From 435f49a518c78eec8e2edbbadd912737246cbe20 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 29 Oct 2010 10:36:49 -0700
Subject: readv/writev: do the same MAX_RW_COUNT truncation that read/write
 does

We used to protect against overflow, but rather than return an error, do
what read/write does, namely to limit the total size to MAX_RW_COUNT.
This is not only more consistent, but it also means that any broken
low-level read/write routine that still keeps counts in 'int' can't
break.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c        | 12 +++++------
 fs/read_write.c    | 62 +++++++++++++++++++++++++++++-------------------------
 include/linux/fs.h |  1 +
 3 files changed, 40 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 52cfeb61da77..ff66c0d7583d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -606,14 +606,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	/*
 	 * Single unix specification:
 	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.  The total length is fitting an ssize_t
+	 * ssize_t.
 	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
+	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
+	 * no overflow possibility.
 	 */
 	tot_len = 0;
 	ret = -EINVAL;
 	for (seg = 0; seg < nr_segs; seg++) {
-		compat_ssize_t tmp = tot_len;
 		compat_uptr_t buf;
 		compat_ssize_t len;
 
@@ -624,13 +624,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
 		}
 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 			goto out;
-		tot_len += len;
-		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-			goto out;
 		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
+		if (len > MAX_RW_COUNT - tot_len)
+			len = MAX_RW_COUNT - tot_len;
+		tot_len += len;
 		iov->iov_base = compat_ptr(buf);
 		iov->iov_len = (compat_size_t) len;
 		uvector++;
diff --git a/fs/read_write.c b/fs/read_write.c
index 9cd9d148105d..431a0ed610c8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -243,8 +243,6 @@ bad:
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
 	struct inode *inode;
@@ -584,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
 			      struct iovec **ret_pointer)
-  {
+{
 	unsigned long seg;
-  	ssize_t ret;
+	ssize_t ret;
 	struct iovec *iov = fast_pointer;
 
-  	/*
-  	 * SuS says "The readv() function *may* fail if the iovcnt argument
-  	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-  	 * traditionally returned zero for zero segments, so...
-  	 */
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
 	if (nr_segs == 0) {
 		ret = 0;
-  		goto out;
+		goto out;
 	}
 
-  	/*
-  	 * First get the "struct iovec" from user memory and
-  	 * verify all the pointers
-  	 */
+	/*
+	 * First get the "struct iovec" from user memory and
+	 * verify all the pointers
+	 */
 	if (nr_segs > UIO_MAXIOV) {
 		ret = -EINVAL;
-  		goto out;
+		goto out;
 	}
 	if (nr_segs > fast_segs) {
-  		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 		if (iov == NULL) {
 			ret = -ENOMEM;
-  			goto out;
+			goto out;
 		}
-  	}
+	}
 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 		ret = -EFAULT;
-  		goto out;
+		goto out;
 	}
 
-  	/*
+	/*
 	 * According to the Single Unix Specification we should return EINVAL
 	 * if an element length is < 0 when cast to ssize_t or if the
 	 * total length would overflow the ssize_t return value of the
 	 * system call.
-  	 */
+	 *
+	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+	 * overflow case.
+	 */
 	ret = 0;
-  	for (seg = 0; seg < nr_segs; seg++) {
-  		void __user *buf = iov[seg].iov_base;
-  		ssize_t len = (ssize_t)iov[seg].iov_len;
+	for (seg = 0; seg < nr_segs; seg++) {
+		void __user *buf = iov[seg].iov_base;
+		ssize_t len = (ssize_t)iov[seg].iov_len;
 
 		/* see if we we're about to use an invalid len or if
 		 * it's about to overflow ssize_t */
-		if (len < 0 || (ret + len < ret)) {
+		if (len < 0) {
 			ret = -EINVAL;
-  			goto out;
+			goto out;
 		}
 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 			ret = -EFAULT;
-  			goto out;
+			goto out;
+		}
+		if (len > MAX_RW_COUNT - ret) {
+			len = MAX_RW_COUNT - ret;
+			iov[seg].iov_len = len;
 		}
-
 		ret += len;
-  	}
+	}
 out:
 	*ret_pointer = iov;
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4d07902bc50c..7b7b507ffa1c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1867,6 +1867,7 @@ extern int current_umask(void);
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
+#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
 extern int rw_verify_area(int, struct file *, loff_t *, size_t);
 
 #define FLOCK_VERIFY_READ  1
-- 
cgit v1.2.3


From 2354d08fe9aeec3e451b85cb5387a6b28dbca0b1 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 29 Oct 2010 15:14:18 -0400
Subject: Btrfs: use memdup_user helpers

Use memdup_user when user data is immediately copied into the
allocated region.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression from,to,size,flag;
position p;
identifier l1,l2;
@@

-  to = \(kmalloc@p\|kzalloc@p\)(size,flag);
+  to = memdup_user(from,size);
   if (
-      to==NULL
+      IS_ERR(to)
                 || ...) {
   <+... when != goto l1;
-  -ENOMEM
+  PTR_ERR(to)
   ...+>
   }
-  if (copy_from_user(to, from, size) != 0) {
-    <+... when != goto l2;
-    -EFAULT
-    ...+>
-  }
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index db0b8fc59235..8079ebfeaf50 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1073,14 +1073,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	args = kmalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return -ENOMEM;
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
 
-	if (copy_from_user(args, argp, sizeof(*args))) {
-		kfree(args);
-		return -EFAULT;
-	}
 	inode = fdentry(file)->d_inode;
 	ret = search_ioctl(inode, args);
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1184,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	args = kmalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return -ENOMEM;
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
 
-	if (copy_from_user(args, argp, sizeof(*args))) {
-		kfree(args);
-		return -EFAULT;
-	}
 	inode = fdentry(file)->d_inode;
 
 	if (args->treeid == 0)
-- 
cgit v1.2.3


From d0b678cb0a26783ab7238784f1e7e608e5caafa3 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 29 Oct 2010 15:14:23 -0400
Subject: Btrfs: Use ERR_CAST helpers

Use ERR_CAST(x) rather than ERR_PTR(PTR_ERR(x)).  The former makes more
clear what is the purpose of the operation, which otherwise looks like a
no-op.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
type T;
T x;
identifier f;
@@

T f (...) { <+...
- ERR_PTR(PTR_ERR(x))
+ x
 ...+> }

@@
expression x;
@@

- ERR_PTR(PTR_ERR(x))
+ ERR_CAST(x)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 4 ++--
 fs/btrfs/super.c      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d6451..23cb8da3ff66 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	if (IS_ERR(rb_node)) {
-		em = ERR_PTR(PTR_ERR(rb_node));
+		em = ERR_CAST(rb_node);
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	if (IS_ERR(rb_node)) {
-		em = ERR_PTR(PTR_ERR(rb_node));
+		em = ERR_CAST(rb_node);
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 65b62daa3f80..d7fb2733d028 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -389,7 +389,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 find_root:
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 	if (IS_ERR(new_root))
-		return ERR_PTR(PTR_ERR(new_root));
+		return ERR_CAST(new_root);
 
 	if (btrfs_root_refs(&new_root->root_item) == 0)
 		return ERR_PTR(-ENOENT);
-- 
cgit v1.2.3


From 411fc6bcef54f828a5458f4730c68abdf13c6bf0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 29 Oct 2010 15:14:31 -0400
Subject: Btrfs: Fix variables set but not read (bugs found by gcc 4.6)

These are all the cases where a variable is set, but not
read which are really bugs.

- Couple of incorrect error handling fixed.
- One incorrect use of a allocation policy
- Some other things

Still needs more review.

Found by gcc 4.6's new warnings.

[akpm@linux-foundation.org: fix build.  Might have been bitrot]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/dir-item.c   | 2 +-
 fs/btrfs/extent_io.c  | 2 ++
 fs/btrfs/inode.c      | 6 +++---
 fs/btrfs/relocation.c | 4 +++-
 fs/btrfs/tree-log.c   | 2 +-
 5 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..f0cad5ae5be7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		ret = btrfs_truncate_item(trans, root, path,
 					  item_len - sub_item_len, 1);
 	}
-	return 0;
+	return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c639e156296..7dc31c39ca59 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2779,6 +2779,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 					 NULL, 1,
 					 end_bio_extent_preparewrite, 0,
 					 0, 0);
+			if (ret && !err)
+				err = ret;
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9f08136b10c4..0aa24717cd58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1389,7 +1389,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
 	if (map_length < length + size)
 		return 1;
-	return 0;
+	return ret;
 }
 
 /*
@@ -2709,8 +2709,8 @@ static int check_path_shared(struct btrfs_root *root,
 {
 	struct extent_buffer *eb;
 	int level;
-	int ret;
 	u64 refs = 1;
+	int uninitialized_var(ret);
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level])
@@ -2723,7 +2723,7 @@ static int check_path_shared(struct btrfs_root *root,
 		if (refs > 1)
 			return 1;
 	}
-	return 0;
+	return ret; /* XXX callers? */
 }
 
 /*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fd0714475db7..045c9c2b2d7e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3094,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 		ret = get_ref_objectid_v0(rc, path, extent_key,
 					  &ref_owner, NULL);
+		if (ret < 0)
+			return ret;
 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
 		level = (int)ref_owner;
 		/* FIXME: get real generation */
@@ -4218,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}
 	btrfs_put_ordered_extent(ordered);
-	return 0;
+	return ret;
 }
 
 void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9c..224fb5b3daad 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2273,7 +2273,7 @@ fail:
 	}
 	btrfs_end_log_trans(root);
 
-	return 0;
+	return err;
 }
 
 /* see comments for btrfs_del_dir_entries_in_log */
-- 
cgit v1.2.3


From 559af8211433b8c0b20e6c43c61409cb9c9c2996 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 29 Oct 2010 15:14:37 -0400
Subject: Btrfs: cleanup warnings from gcc 4.6 (nonbugs)

These are all the cases where a variable is set, but not read which are
not bugs as far as I can see, but simply leftovers.

Still needs more review.

Found by gcc 4.6's new warnings

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  |  2 --
 fs/btrfs/ctree.c        | 20 ++------------------
 fs/btrfs/disk-io.c      | 11 -----------
 fs/btrfs/extent-tree.c  |  2 --
 fs/btrfs/extent_io.c    |  9 ---------
 fs/btrfs/inode.c        | 14 --------------
 fs/btrfs/ioctl.c        |  2 --
 fs/btrfs/ordered-data.c |  2 --
 fs/btrfs/root-tree.c    |  2 --
 fs/btrfs/super.c        |  6 ++----
 fs/btrfs/tree-defrag.c  |  2 --
 fs/btrfs/tree-log.c     | 15 ---------------
 fs/btrfs/volumes.c      |  4 ----
 fs/btrfs/xattr.c        |  2 --
 fs/btrfs/zlib.c         |  5 -----
 15 files changed, 4 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a2..7845d1f7d1d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
  */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-	struct extent_io_tree *tree;
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6921231e0efb..9ac171599258 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
 	struct extent_buffer *cow;
-	u32 nritems;
 	int ret = 0;
 	int level;
 	struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
-	nritems = btrfs_header_nritems(buf);
 	if (level == 0)
 		btrfs_item_key(buf, &disk_key, 0);
 	else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	int err_on_enospc = 0;
 	u64 orig_ptr;
 
 	if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) < 2)
-		err_on_enospc = 1;
+	btrfs_header_nritems(mid);
 
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = push_node_left(trans, root, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
-		if (btrfs_header_nritems(mid) < 2)
-			err_on_enospc = 1;
+		btrfs_header_nritems(mid);
 	}
 
 	/*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	u64 orig_ptr;
 
 	if (level == 0)
 		return 1;
 
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
-	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
 		parent = path->nodes[level + 1];
@@ -2567,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
-	int slot;
 	int i;
 	int push_space = 0;
 	int push_items = 0;
@@ -2579,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	u32 this_item_size;
 	u32 old_left_item_size;
 
-	slot = path->slots[1];
-
 	if (empty)
 		nr = min(right_nritems, max_slot);
 	else
@@ -3349,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3359,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	unsigned int size_diff;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
@@ -3464,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3473,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	unsigned int old_size;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 
 	nritems = btrfs_header_nritems(leaf);
@@ -3806,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_key *cpu_key, u32 *data_size,
 			    int nr)
 {
-	struct extent_buffer *leaf;
 	int ret = 0;
 	int slot;
 	int i;
@@ -3823,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 
-	leaf = path->nodes[0];
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 77e5dabfd45a..e163424c7fce 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
-	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
 	int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		goto err;
 	}
-	found_level = btrfs_header_level(eb);
-
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
@@ -543,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 
 static void run_one_async_start(struct btrfs_work *work)
 {
-	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
 			       async->mirror_num, async->bio_flags,
 			       async->bio_offset);
@@ -860,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree;
 	int ret;
 
-	io_tree = &BTRFS_I(btree_inode)->io_tree;
-
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
@@ -1387,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
 	u64 start = 0;
 	struct page *page;
 	struct extent_io_tree *io_tree = NULL;
-	struct btrfs_fs_info *info = NULL;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -1406,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
 		buf_len = page->private >> 2;
 		start = page_offset(page) + bvec->bv_offset;
 		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-		info = BTRFS_I(page->mapping->host)->root->fs_info;
 	}
 	/* are we fully contained in this bio? */
 	if (buf_len <= length)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 59c8daaacf0c..df754108b952 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5719,7 +5719,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 	u64 generation;
 	u64 refs;
 	u64 flags;
-	u64 last = 0;
 	u32 nritems;
 	u32 blocksize;
 	struct btrfs_key key;
@@ -5787,7 +5786,6 @@ reada:
 					   generation);
 		if (ret)
 			break;
-		last = bytenr + blocksize;
 		nread++;
 	}
 	wc->reada_slot = slot;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7dc31c39ca59..3b7eaee0f912 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1857,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
 	u64 start;
-	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
-	end = start + bvec->bv_len - 1;
 
 	bio->bi_private = NULL;
 
@@ -2160,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
-	u64 unlock_start;
 	sector_t sector;
 	struct extent_state *cached_state = NULL;
 	struct extent_map *em;
@@ -2285,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (tree->ops && tree->ops->writepage_end_io_hook)
 			tree->ops->writepage_end_io_hook(page, start,
 							 page_end, NULL, 1);
-		unlock_start = page_end + 1;
 		goto done;
 	}
 
@@ -2296,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 page_end, NULL, 1);
-			unlock_start = page_end + 1;
 			break;
 		}
 		em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2343,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			cur += iosize;
 			pg_offset += iosize;
-			unlock_start = cur;
 			continue;
 		}
 		/* leave this out until we have a page_mkwrite call */
@@ -2429,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int range_whole = 0;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -2438,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
 		scanned = 1;
 	}
 retry:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0aa24717cd58..609f3bbbd1ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 num_bytes;
-	u64 orig_start;
-	u64 disk_num_bytes;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
 	u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
 	int i;
 	int will_compress;
 
-	orig_start = start;
-
 	actual_end = min_t(u64, isize, end + 1);
 again:
 	will_compress = 0;
@@ -371,7 +367,6 @@ again:
 	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	disk_num_bytes = num_bytes;
 	total_in = 0;
 	ret = 0;
 
@@ -467,7 +462,6 @@ again:
 		if (total_compressed >= total_in) {
 			will_compress = 0;
 		} else {
-			disk_num_bytes = total_compressed;
 			num_bytes = total_in;
 		}
 	}
@@ -757,8 +751,6 @@ static noinline int cow_file_range(struct inode *inode,
 	u64 disk_num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 actual_end;
-	u64 isize = i_size_read(inode);
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -770,8 +762,6 @@ static noinline int cow_file_range(struct inode *inode,
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
-	actual_end = min_t(u64, isize, end + 1);
-
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
 	disk_num_bytes = num_bytes;
@@ -2274,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	struct btrfs_key key, found_key;
 	struct btrfs_trans_handle *trans;
 	struct inode *inode;
@@ -2312,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		/* pull out the item */
 		leaf = path->nodes[0];
-		item = btrfs_item_nr(leaf, path->slots[0]);
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 		/* make sure the item matches what we want */
@@ -5701,7 +5689,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
 	struct bio_vec *bvec = bio->bi_io_vec;
-	u64 start;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
@@ -5727,7 +5714,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
 
-	start = dip->logical_offset;
 	dip->bytes = 0;
 	do {
 		dip->bytes += bvec->bv_len;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8079ebfeaf50..60f662c4778b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -708,7 +708,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 	char *sizestr;
 	char *devstr = NULL;
 	int ret = 0;
-	int namelen;
 	int mod = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +721,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	namelen = strlen(vol_args->name);
 
 	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5add..f4621f6deca1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
 	u64 orig_end;
-	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
 	int found;
 
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		if (orig_end > INT_LIMIT(loff_t))
 			orig_end = INT_LIMIT(loff_t);
 	}
-	wait_end = orig_end;
 again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c8..6a1086e83ffc 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_root *dead_root;
-	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
 			nritems = btrfs_header_nritems(leaf);
 			slot = path->slots[0];
 		}
-		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
 			goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d7fb2733d028..0002e6d1a16f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
 
 	ret = close_ctree(root);
 	sb->s_fs_info = NULL;
+
+	(void)ret; /* FIXME: need to fix VFS to return error? */
 }
 
 enum {
@@ -445,7 +447,6 @@ static int btrfs_fill_super(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *root_dentry;
-	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
 	struct btrfs_key key;
 	int err;
@@ -467,7 +468,6 @@ static int btrfs_fill_super(struct super_block *sb,
 		return PTR_ERR(tree_root);
 	}
 	sb->s_fs_info = tree_root;
-	disk_super = &tree_root->fs_info->super_copy;
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
@@ -580,7 +580,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
 	int error = 0;
-	int found = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
@@ -616,7 +615,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
-		found = 1;
 		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed7..992ab425599d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int wret;
 	int level;
-	int orig_level;
 	int is_extent = 0;
 	int next_key_ret = 0;
 	u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	level = btrfs_header_level(root->node);
-	orig_level = level;
 
 	if (level == 0)
 		goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 224fb5b3daad..a29f19384a27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 {
 	struct inode *dir;
 	int ret;
-	struct btrfs_key location;
 	struct btrfs_inode_ref *ref;
 	struct btrfs_dir_item *di;
 	struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	unsigned long ref_ptr;
 	unsigned long ref_end;
 
-	location.objectid = key->objectid;
-	location.type = BTRFS_INODE_ITEM_KEY;
-	location.offset = 0;
-
 	/*
 	 * it is possible that we didn't log all the parent directories
 	 * for a given inode.  If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	struct btrfs_path *path;
 	struct btrfs_root *root = wc->replay_dest;
 	struct btrfs_key key;
-	u32 item_size;
 	int level;
 	int i;
 	int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	nritems = btrfs_header_nritems(eb);
 	for (i = 0; i < nritems; i++) {
 		btrfs_item_key_to_cpu(eb, &key, i);
-		item_size = btrfs_item_size_nr(eb, i);
 
 		/* inode keys are done during the first stage */
 		if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct walk_control *wc)
 {
 	u64 root_owner;
-	u64 root_gen;
 	u64 bytenr;
 	u64 ptr_gen;
 	struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 		parent = path->nodes[*level];
 		root_owner = btrfs_header_owner(parent);
-		root_gen = btrfs_header_generation(parent);
 
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct walk_control *wc)
 {
 	u64 root_owner;
-	u64 root_gen;
 	int i;
 	int slot;
 	int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
-			struct extent_buffer *node;
-			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
 			WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				parent = path->nodes[*level + 1];
 
 			root_owner = btrfs_header_owner(parent);
-			root_gen = btrfs_header_generation(parent);
 			wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
 			if (wc->free) {
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
-	u32 size;
 	int err = 0;
 	int ret;
 	int nritems;
@@ -2793,7 +2779,6 @@ again:
 			break;
 
 		src = path->nodes[0];
-		size = btrfs_item_size_nr(src, path->slots[0]);
 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
 			ins_nr++;
 			goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 28681e729b1d..91851b555e2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1901,7 +1901,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	u64 size_to_free;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_chunk *chunk;
 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
@@ -1965,9 +1964,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (found_key.objectid != key.objectid)
 			break;
 
-		chunk = btrfs_item_ptr(path->nodes[0],
-				       path->slots[0],
-				       struct btrfs_chunk);
 		/* chunk zero is special */
 		if (found_key.offset == 0)
 			break;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb215878..698fdd2c739c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
-	struct btrfs_item *item;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
 	int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		}
 		advance = 1;
 
-		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		/* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..b9cd5445f71c 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	int nr_pages = 0;
 	struct page *in_page = NULL;
 	struct page *out_page = NULL;
-	int out_written = 0;
-	int in_read = 0;
 	unsigned long bytes_left;
 
 	*out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
 	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
 
-	out_written = 0;
-	in_read = 0;
-
 	while (workspace->def_strm.total_in < len) {
 		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 		if (ret != Z_OK) {
-- 
cgit v1.2.3


From d8e39c457bc1ca2a7304bc086c7b0f0c10854921 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 29 Oct 2010 15:17:41 -0400
Subject: Btrfs: drop unused variable in block_alloc_rsv

The alloc_target variable is not really used.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df754108b952..a541bc87f04c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3607,18 +3607,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 alloc_target;
 
 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 	if (!block_rsv)
 		return NULL;
 
 	btrfs_init_block_rsv(block_rsv);
-
-	alloc_target = btrfs_get_alloc_profile(root, 0);
 	block_rsv->space_info = __find_space_info(fs_info,
 						  BTRFS_BLOCK_GROUP_METADATA);
-
 	return block_rsv;
 }
 
-- 
cgit v1.2.3


From 9a019196ecaa57780141ef5d1f0bb31050d6ed5b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:37:33 -0400
Subject: Btrfs: fix delalloc checks in clone ioctl

The lookup_first_ordered_extent() was done on the wrong inode, and the
->delalloc_bytes test was wrong, as the following
btrfs_wait_ordered_range() would only invoke a range write and wouldn't
write the entire file data range. Also, a bad parameter was passed to
btrfs_wait_ordered_range().

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 60f662c4778b..d94bef5179fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1520,13 +1520,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
-		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+		ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+		if (!ordered &&
+		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
+				   EXTENT_DELALLOC, 0, NULL))
 			break;
 		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
-		btrfs_wait_ordered_range(src, off, off+len);
+		btrfs_wait_ordered_range(src, off, len);
 	}
 
 	/* clone data */
-- 
cgit v1.2.3


From 050006a753bab8ba05f2113cc57ba49398cd5521 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:37:33 -0400
Subject: Btrfs: fix clone ioctl where range is adjacent to extent

We had an edge case issue where the requested range was just
following an existing extent. Instead of skipping to the next
extent, we used the previous one which lead to having zero
sized extents.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d94bef5179fc..3fe15e435b5c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1597,7 +1597,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			}
 			btrfs_release_path(root, path);
 
-			if (key.offset + datal < off ||
+			if (key.offset + datal <= off ||
 			    key.offset >= off+len)
 				goto next;
 
-- 
cgit v1.2.3


From fccdae435c1b295cca546f23f6f43126a28ffac3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:37:33 -0400
Subject: Btrfs: fix lockdep warning on clone ioctl

I'm no lockdep expert, but this appears to make the lockdep warning go
away for the i_mutex locking in the clone ioctl.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3fe15e435b5c..93d69b32028e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1492,11 +1492,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	path->reada = 2;
 
 	if (inode < src) {
-		mutex_lock(&inode->i_mutex);
-		mutex_lock(&src->i_mutex);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
 	} else {
-		mutex_lock(&src->i_mutex);
-		mutex_lock(&inode->i_mutex);
+		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 	}
 
 	/* determine range to clone */
-- 
cgit v1.2.3


From 99d16cbcaf694c803a1b6bf7e851694ffe1d255d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:37:34 -0400
Subject: Btrfs: fix deadlock in btrfs_commit_transaction

We calculate timeout (either 1 or MAX_SCHEDULE_TIMEOUT) based on whether
num_writers > 1 or should_grow at the top of the loop.  Then, much much
later, we wait for that timeout if either num_writers or should_grow is
true.  However, it's possible for a racing process (calling
btrfs_end_transaction()) to decrement num_writers such that we wait
forever instead of for 1.

Fix this by deciding how long to wait when we wait.  Include a smp_mb()
before checking if the waitqueue is active to ensure the num_writers
is visible.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 325d9a5f0128..700dc4b34ada 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -402,6 +402,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
 
+	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
@@ -1010,7 +1011,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
 	unsigned long joined = 0;
-	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	DEFINE_WAIT(wait);
@@ -1081,11 +1081,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			snap_pending = 1;
 
 		WARN_ON(cur_trans != trans->transaction);
-		if (cur_trans->num_writers > 1)
-			timeout = MAX_SCHEDULE_TIMEOUT;
-		else if (should_grow)
-			timeout = 1;
-
 		mutex_unlock(&root->fs_info->trans_mutex);
 
 		if (flush_on_commit || snap_pending) {
@@ -1107,8 +1102,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				TASK_UNINTERRUPTIBLE);
 
 		smp_mb();
-		if (cur_trans->num_writers > 1 || should_grow)
-			schedule_timeout(timeout);
+		if (cur_trans->num_writers > 1)
+			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+		else if (should_grow)
+			schedule_timeout(1);
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
-- 
cgit v1.2.3


From bb9c12c945cbd1b0eaa1589546dde772ccabeeba Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:37:34 -0400
Subject: Btrfs: async transaction commit

Add support for an async transaction commit that is ordered such that any
subsequent operations will join the following transaction, but does not
wait until the current commit is fully on disk.  This avoids much of the
latency associated with the btrfs_commit_transaction for callers concerned
with serialization and not safety.

The wait_for_unblock flag controls whether we wait for the 'middle' portion
of commit_transaction to complete, which is necessary if the caller expects
some of the modifications contained in the commit to be available (this is
the case for subvol/snapshot creation).

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/disk-io.c     |   1 +
 fs/btrfs/transaction.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h |   3 ++
 4 files changed, 124 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 88c0fb7e12d2..e5d66b13c175 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -901,6 +901,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
+	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
 	struct btrfs_super_block super_copy;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e163424c7fce..b40dfe48017b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1679,6 +1679,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
+	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 700dc4b34ada..9f40bfc9c45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1007,6 +1007,123 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 	return ret;
 }
 
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+					    struct btrfs_transaction *trans)
+{
+	DEFINE_WAIT(wait);
+
+	if (trans->in_commit)
+		return;
+
+	while (1) {
+		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (trans->in_commit) {
+			finish_wait(&root->fs_info->transaction_blocked_wait,
+				    &wait);
+			break;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
+	}
+}
+
+/*
+ * wait for the current transaction to start and then become unblocked.
+ * caller holds ref.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+					 struct btrfs_transaction *trans)
+{
+	DEFINE_WAIT(wait);
+
+	if (trans->commit_done || (trans->in_commit && !trans->blocked))
+		return;
+
+	while (1) {
+		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (trans->commit_done ||
+		    (trans->in_commit && !trans->blocked)) {
+			finish_wait(&root->fs_info->transaction_wait,
+				    &wait);
+			break;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+		finish_wait(&root->fs_info->transaction_wait,
+			    &wait);
+	}
+}
+
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ */
+struct btrfs_async_commit {
+	struct btrfs_trans_handle *newtrans;
+	struct btrfs_root *root;
+	struct delayed_work work;
+};
+
+static void do_async_commit(struct work_struct *work)
+{
+	struct btrfs_async_commit *ac =
+		container_of(work, struct btrfs_async_commit, work.work);
+
+	btrfs_commit_transaction(ac->newtrans, ac->root);
+	kfree(ac);
+}
+
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock)
+{
+	struct btrfs_async_commit *ac;
+	struct btrfs_transaction *cur_trans;
+
+	ac = kmalloc(sizeof(*ac), GFP_NOFS);
+	BUG_ON(!ac);
+
+	INIT_DELAYED_WORK(&ac->work, do_async_commit);
+	ac->root = root;
+	ac->newtrans = btrfs_join_transaction(root, 0);
+
+	/* take transaction reference */
+	mutex_lock(&root->fs_info->trans_mutex);
+	cur_trans = trans->transaction;
+	cur_trans->use_count++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	btrfs_end_transaction(trans, root);
+	schedule_delayed_work(&ac->work, 0);
+
+	/* wait for transaction to start and unblock */
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (wait_for_unblock)
+		wait_current_trans_commit_start_and_unblock(root, cur_trans);
+	else
+		wait_current_trans_commit_start(root, cur_trans);
+	put_transaction(cur_trans);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
+/*
+ * btrfs_transaction state sequence:
+ *    in_commit = 0, blocked = 0  (initial)
+ *    in_commit = 1, blocked = 1
+ *    blocked = 0
+ *    commit_done = 1
+ */
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -1057,6 +1174,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
+	wake_up(&root->fs_info->transaction_blocked_wait);
+
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 15f83e1c1ef7..e1908e6872fe 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -108,6 +108,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 462045928bda777c86919a396a42991fcf235378 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:41:32 -0400
Subject: Btrfs: add START_SYNC, WAIT_SYNC ioctls

START_SYNC will start a sync/commit, but not wait for it to
complete.  Any modification started after the ioctl returns is
guaranteed not to be included in the commit.  If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.

WAIT_SYNC will wait for any in-progress commit to complete.  If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately.  If the specified transaction doesn't exist, it
returns EINVAL.

If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.

These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.

Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.

Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result.  However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well.  Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c       | 34 +++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h       |  2 ++
 fs/btrfs/transaction.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h |  1 +
 4 files changed, 89 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 93d69b32028e..dc5a19ed07f3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2028,6 +2028,36 @@ long btrfs_ioctl_trans_end(struct file *file)
 	return 0;
 }
 
+static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+{
+	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 transid;
+
+	trans = btrfs_start_transaction(root, 0);
+	transid = trans->transid;
+	btrfs_commit_transaction_async(trans, root, 0);
+
+	if (argp)
+		if (copy_to_user(argp, &transid, sizeof(transid)))
+			return -EFAULT;
+	return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+{
+	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+	u64 transid;
+
+	if (argp) {
+		if (copy_from_user(&transid, argp, sizeof(transid)))
+			return -EFAULT;
+	} else {
+		transid = 0;  /* current trans */
+	}
+	return btrfs_wait_for_commit(root, transid);
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2078,6 +2108,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
+	case BTRFS_IOC_START_SYNC:
+		return btrfs_ioctl_start_sync(file, argp);
+	case BTRFS_IOC_WAIT_SYNC:
+		return btrfs_ioctl_wait_sync(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517f..16e1442523b7 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -178,4 +178,6 @@ struct btrfs_ioctl_space_args {
 #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
 #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
 				    struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9f40bfc9c45c..1fffbc017bdf 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -279,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+	struct btrfs_transaction *cur_trans = NULL, *t;
+	int ret;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+
+	ret = 0;
+	if (transid) {
+		if (transid <= root->fs_info->last_trans_committed)
+			goto out_unlock;
+
+		/* find specified transaction */
+		list_for_each_entry(t, &root->fs_info->trans_list, list) {
+			if (t->transid == transid) {
+				cur_trans = t;
+				break;
+			}
+			if (t->transid > transid)
+				break;
+		}
+		ret = -EINVAL;
+		if (!cur_trans)
+			goto out_unlock;  /* bad transid */
+	} else {
+		/* find newest transaction that is committing | committed */
+		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+					    list) {
+			if (t->in_commit) {
+				if (t->commit_done)
+					goto out_unlock;
+				cur_trans = t;
+				break;
+			}
+		}
+		if (!cur_trans)
+			goto out_unlock;  /* nothing committing|committed */
+	}
+
+	cur_trans->use_count++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	wait_for_commit(root, cur_trans);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	put_transaction(cur_trans);
+	ret = 0;
+out_unlock:
+	mutex_unlock(&root->fs_info->trans_mutex);
+	return ret;
+}
+
 #if 0
 /*
  * rate limit against the drop_snapshot code.  This helps to slow down new
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e1908e6872fe..f104b57ad4ef 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -97,6 +97,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root
 							  int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 72fd032e94240d001b1d22f2c1dfd2592b02e44e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:41:32 -0400
Subject: Btrfs: add SNAP_CREATE_ASYNC ioctl

Create a snap without waiting for it to commit to disk.  The ioctl is
ordered such that subsequent operations will not be contained by the
created snapshot, and the commit is initiated, but the ioctl does not
wait for the snapshot to commit to disk.

We return the specific transid to userspace so that an application can wait
for this specific snapshot creation to commit via the WAIT_SYNC ioctl.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 107 ++++++++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/ioctl.h |  11 +++++-
 2 files changed, 93 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dc5a19ed07f3..e8a26a3aac3e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int create_subvol(struct btrfs_root *root,
 				  struct dentry *dentry,
-				  char *name, int namelen)
+				  char *name, int namelen,
+				  u64 *async_transid)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
-	err = btrfs_commit_transaction(trans, root);
+	if (async_transid) {
+		*async_transid = trans->transid;
+		err = btrfs_commit_transaction_async(trans, root, 1);
+	} else {
+		err = btrfs_commit_transaction(trans, root);
+	}
 	if (err && !ret)
 		ret = err;
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+			   char *name, int namelen, u64 *async_transid)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
+	if (async_transid) {
+		*async_transid = trans->transid;
+		ret = btrfs_commit_transaction_async(trans,
+				     root->fs_info->extent_root, 1);
+	} else {
+		ret = btrfs_commit_transaction(trans,
+					       root->fs_info->extent_root);
+	}
 	BUG_ON(ret);
 
 	ret = pending_snapshot->error;
@@ -412,7 +426,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  */
 static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
-				   struct btrfs_root *snap_src)
+				   struct btrfs_root *snap_src,
+				   u64 *async_transid)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
@@ -443,10 +458,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry);
+		error = create_snapshot(snap_src, dentry,
+					name, namelen, async_transid);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
-				      name, namelen);
+				      name, namelen, async_transid);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -799,11 +815,13 @@ out_unlock:
 	return ret;
 }
 
-static noinline int btrfs_ioctl_snap_create(struct file *file,
-					    void __user *arg, int subvol)
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+						    char *name,
+						    unsigned long fd,
+						    int subvol,
+						    u64 *transid)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
-	struct btrfs_ioctl_vol_args *vol_args;
 	struct file *src_file;
 	int namelen;
 	int ret = 0;
@@ -811,23 +829,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
-
-	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	namelen = strlen(vol_args->name);
-	if (strchr(vol_args->name, '/')) {
+	namelen = strlen(name);
+	if (strchr(name, '/')) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (subvol) {
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
-				     NULL);
+		ret = btrfs_mksubvol(&file->f_path, name, namelen,
+				     NULL, transid);
 	} else {
 		struct inode *src_inode;
-		src_file = fget(vol_args->fd);
+		src_file = fget(fd);
 		if (!src_file) {
 			ret = -EINVAL;
 			goto out;
@@ -841,12 +854,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 			fput(src_file);
 			goto out;
 		}
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
-				     BTRFS_I(src_inode)->root);
+		ret = btrfs_mksubvol(&file->f_path, name, namelen,
+				     BTRFS_I(src_inode)->root,
+				     transid);
 		fput(src_file);
 	}
 out:
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+					    void __user *arg, int subvol,
+					    int async)
+{
+	struct btrfs_ioctl_vol_args *vol_args = NULL;
+	struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+	char *name;
+	u64 fd;
+	u64 transid = 0;
+	int ret;
+
+	if (async) {
+		async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+		if (IS_ERR(async_vol_args))
+			return PTR_ERR(async_vol_args);
+
+		name = async_vol_args->name;
+		fd = async_vol_args->fd;
+		async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+	} else {
+		vol_args = memdup_user(arg, sizeof(*vol_args));
+		if (IS_ERR(vol_args))
+			return PTR_ERR(vol_args);
+		name = vol_args->name;
+		fd = vol_args->fd;
+		vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+	}
+
+	ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+					      subvol, &transid);
+
+	if (!ret && async) {
+		if (copy_to_user(arg +
+				offsetof(struct btrfs_ioctl_async_vol_args,
+				transid), &transid, sizeof(transid)))
+			return -EFAULT;
+	}
+
 	kfree(vol_args);
+	kfree(async_vol_args);
+
 	return ret;
 }
 
@@ -2072,9 +2129,11 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case FS_IOC_GETVERSION:
 		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 0);
+		return btrfs_ioctl_snap_create(file, argp, 0, 0);
+	case BTRFS_IOC_SNAP_CREATE_ASYNC:
+		return btrfs_ioctl_snap_create(file, argp, 0, 1);
 	case BTRFS_IOC_SUBVOL_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 1);
+		return btrfs_ioctl_snap_create(file, argp, 1, 0);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_DEFAULT_SUBVOL:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 16e1442523b7..17c99ebdf960 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 4087
 
 /* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
 struct btrfs_ioctl_vol_args {
 	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+#define BTRFS_SNAPSHOT_NAME_MAX 4079
+struct btrfs_ioctl_async_vol_args {
+	__s64 fd;
+	__u64 transid;
+	char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
 	__u64 treeid;
@@ -180,4 +187,6 @@ struct btrfs_ioctl_space_args {
 				    struct btrfs_ioctl_space_args)
 #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+				   struct btrfs_ioctl_async_vol_args)
 #endif
-- 
cgit v1.2.3


From 531cb13f1e417c060b54f979e1659ecd69bea650 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:41:32 -0400
Subject: Btrfs: make SNAP_DESTROY async

There is no reason to force an immediate commit when deleting a snapshot.
Users have some expectation that space from a deleted snapshot be freed
immediately, but even if we do commit the reclaim is a background process.

If users _do_ want the deletion to be durable, they can call 'sync'.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8a26a3aac3e..fdd88f2f1ece 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1351,7 +1351,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	inode->i_flags |= S_DEAD;
 out_up_write:
-- 
cgit v1.2.3


From 4260f7c7516f4c209cf0ca34fda99cc9a0847772 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 29 Oct 2010 15:46:43 -0400
Subject: Btrfs: allow subvol deletion by unprivileged user with -o
 user_subvol_rm_allowed

Add a mount option user_subvol_rm_allowed that allows users to delete a
(potentially non-empty!) subvol when they would otherwise we allowed to do
an rmdir(2).  We duplicate the may_delete() checks from the core VFS code
to implement identical security checks (minus the directory size check).
We additionally require that the user has write+exec permission on the
subvol root inode.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |   1 +
 fs/btrfs/ioctl.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/super.c |   5 +++
 3 files changed, 116 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e5d66b13c175..8db9234f6b41 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1234,6 +1234,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
 #define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
 #define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fdd88f2f1ece..463d91b4dd3a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -409,6 +409,76 @@ fail:
 	return ret;
 }
 
+/*  copy of check_sticky in fs/namei.c()
+* It's inline, so penalty for filesystems that don't use sticky bit is
+* minimal.
+*/
+static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
+{
+	uid_t fsuid = current_fsuid();
+
+	if (!(dir->i_mode & S_ISVTX))
+		return 0;
+	if (inode->i_uid == fsuid)
+		return 0;
+	if (dir->i_uid == fsuid)
+		return 0;
+	return !capable(CAP_FOWNER);
+}
+
+/*  copy of may_delete in fs/namei.c()
+ *	Check whether we can remove a link victim from directory dir, check
+ *  whether the type of victim is right.
+ *  1. We can't do it if dir is read-only (done in permission())
+ *  2. We should have write and exec permissions on dir
+ *  3. We can't remove anything from append-only dir
+ *  4. We can't do anything with immutable dir (done in permission())
+ *  5. If the sticky bit on dir is set we should either
+ *	a. be owner of dir, or
+ *	b. be owner of victim, or
+ *	c. have CAP_FOWNER capability
+ *  6. If the victim is append-only or immutable we can't do antyhing with
+ *     links pointing to it.
+ *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ *  9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+	int error;
+
+	if (!victim->d_inode)
+		return -ENOENT;
+
+	BUG_ON(victim->d_parent->d_inode != dir);
+	audit_inode_child(victim, dir);
+
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+	if (IS_APPEND(dir))
+		return -EPERM;
+	if (btrfs_check_sticky(dir, victim->d_inode)||
+		IS_APPEND(victim->d_inode)||
+	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+		return -EPERM;
+	if (isdir) {
+		if (!S_ISDIR(victim->d_inode->i_mode))
+			return -ENOTDIR;
+		if (IS_ROOT(victim))
+			return -EBUSY;
+	} else if (S_ISDIR(victim->d_inode->i_mode))
+		return -EISDIR;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EBUSY;
+	return 0;
+}
+
 /* copy of may_create in fs/namei.c() */
 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 {
@@ -1274,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	int ret;
 	int err = 0;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
@@ -1306,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	}
 
 	inode = dentry->d_inode;
+	dest = BTRFS_I(inode)->root;
+	if (!capable(CAP_SYS_ADMIN)){
+		/*
+		 * Regular user.  Only allow this with a special mount
+		 * option, when the user has write+exec access to the
+		 * subvol root, and when rmdir(2) would have been
+		 * allowed.
+		 *
+		 * Note that this is _not_ check that the subvol is
+		 * empty or doesn't contain data that we wouldn't
+		 * otherwise be able to delete.
+		 *
+		 * Users who want to delete empty subvols should try
+		 * rmdir(2).
+		 */
+		err = -EPERM;
+		if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+			goto out_dput;
+
+		/*
+		 * Do not allow deletion if the parent dir is the same
+		 * as the dir to be deleted.  That means the ioctl
+		 * must be called on the dentry referencing the root
+		 * of the subvol, not a random directory contained
+		 * within it.
+		 */
+		err = -EINVAL;
+		if (root == dest)
+			goto out_dput;
+
+		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+		if (err)
+			goto out_dput;
+
+		/* check if subvolume may be deleted by a non-root user */
+		err = btrfs_may_delete(dir, dentry, 1);
+		if (err)
+			goto out_dput;
+	}
+
 	if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
 		err = -EINVAL;
 		goto out_dput;
 	}
 
-	dest = BTRFS_I(inode)->root;
-
 	mutex_lock(&inode->i_mutex);
 	err = d_invalidate(dentry);
 	if (err)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0002e6d1a16f..718b10de2049 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -71,6 +71,7 @@ enum {
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
 	Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+	Opt_user_subvol_rm_allowed,
 };
 
 static match_table_t tokens = {
@@ -96,6 +97,7 @@ static match_table_t tokens = {
 	{Opt_discard, "discard"},
 	{Opt_space_cache, "space_cache"},
 	{Opt_clear_cache, "clear_cache"},
+	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
 	{Opt_err, NULL},
 };
 
@@ -246,6 +248,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: force clearing of disk cache\n");
 			btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
 			break;
+		case Opt_user_subvol_rm_allowed:
+			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
-- 
cgit v1.2.3


From 117bf5fbdbdc7a5394e5718b3354238961c83067 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Sep 2010 14:32:56 +0000
Subject: hpfs: Convert sbi->hpfs_creation_de to mutex

sbi->hpfs_creation_de is used as mutex so make it a mutex.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Christoph Hellwig <hch@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
LKML-Reference: <20100907125056.228874895@linutronix.de>
---
 fs/hpfs/buffer.c  | 4 ++--
 fs/hpfs/hpfs_fn.h | 2 +-
 fs/hpfs/super.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
 	printk("lock creation\n");
 #endif
-	down(&hpfs_sb(s)->hpfs_creation_de);
+	mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
 }
 
 void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
 	printk("unlock creation\n");
 #endif
-	up(&hpfs_sb(s)->hpfs_creation_de);
+	mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
 }
 
 /* Map a sector into a buffer and return pointers to it and to the buffer. */
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..2fee17d0d9ab 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
 	unsigned *sb_bmp_dir;		/* main bitmap directory */
 	unsigned sb_c_bitmap;		/* current bitmap */
 	unsigned sb_max_fwd_alloc;	/* max forwad allocation */
-	struct semaphore hpfs_creation_de; /* when creating dirents, nobody else
+	struct mutex hpfs_creation_de;	/* when creating dirents, nobody else
 					   can alloc blocks */
 	/*unsigned sb_mounting : 1;*/
 	int sb_timeshift;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index c969a1aa163a..18e1d4566ecf 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -491,7 +491,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 	sbi->sb_bmp_dir = NULL;
 	sbi->sb_cp_table = NULL;
 
-	init_MUTEX(&sbi->hpfs_creation_de);
+	mutex_init(&sbi->hpfs_creation_de);
 
 	uid = current_uid();
 	gid = current_gid();
-- 
cgit v1.2.3


From 51dfacdef38b1dd6fc58b03dd1725d517516b115 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Oct 2010 22:34:39 +0200
Subject: jbd2: Convert jbd2_slab_create_sem to mutex

jbd2_slab_create_sem is used as a mutex, so make it one.

[ akpm muttered: We may as well make it local to
jbd2_journal_create_slab() also. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.00.1010162231480.2496@localhost6.localdomain6>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jbd2/journal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 538417c1fdbb..c590d155c095 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1838,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
  */
 #define JBD2_MAX_SLABS 8
 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-static DECLARE_MUTEX(jbd2_slab_create_sem);
 
 static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
 	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1859,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
 
 static int jbd2_journal_create_slab(size_t size)
 {
+	static DEFINE_MUTEX(jbd2_slab_create_mutex);
 	int i = order_base_2(size) - 10;
 	size_t slab_size;
 
@@ -1870,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
 
 	if (unlikely(i < 0))
 		i = 0;
-	down(&jbd2_slab_create_sem);
+	mutex_lock(&jbd2_slab_create_mutex);
 	if (jbd2_slab[i]) {
-		up(&jbd2_slab_create_sem);
+		mutex_unlock(&jbd2_slab_create_mutex);
 		return 0;	/* Already created */
 	}
 
 	slab_size = 1 << (i+10);
 	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
 					 slab_size, 0, NULL);
-	up(&jbd2_slab_create_sem);
+	mutex_unlock(&jbd2_slab_create_mutex);
 	if (!jbd2_slab[i]) {
 		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
 		return -ENOMEM;
-- 
cgit v1.2.3


From 6418c96107a2b399848bb8cfc6e29f11ca74fb94 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 30 Oct 2010 07:34:24 -0400
Subject: Btrfs: deal with errors from updating the tree log

During unlink we remove any references to the inode from
the tree log.  It can return -ENOENT and other errors,
and this changes the unlink code to deal with it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 609f3bbbd1ed..5132c9af888a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2676,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
-	BUG_ON(ret);
+	if (ret == -ENOENT)
+		ret = 0;
 err:
 	btrfs_free_path(path);
 	if (ret)
-- 
cgit v1.2.3


From 504b701bb1655747575095543c083267418e02ac Mon Sep 17 00:00:00 2001
From: wu zhangjin <wuzhangjin@gmail.com>
Date: Sat, 30 Oct 2010 08:19:35 -0700
Subject: fs/compat.c: fix build on MIPS/s390

The definition of PAGE_CACHE_MASK in <linux/pagemap.h> is needed to use
MAX_RW_COUNT, and on x86-64 that gets done indirectly through the
architecture header includes.  But on MIPS and s390 that doesn't happen,
and we need to make sure that fs/compat.c includes pagemap.h explicitly.

Introduced in commit 435f49a518c7 ("readv/writev: do the same
MAX_RW_COUNT truncation that read/write does").

Reported-by: Sachin Sant <sachinp@in.ibm.com> (S390)
Reported-by: wu zhangjin <wuzhangjin@gmail.com> (MIPS)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index ff66c0d7583d..c580c322fa6b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
-- 
cgit v1.2.3


From cdf01dd5443d0befc8c6a32cb2e3d2f568fd2558 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 30 Oct 2010 08:55:52 -0700
Subject: fs-writeback.c: unify some common code

The btrfs merge looks like hell, because it changes fs-writeback.c, and
the crazy code has this repeated "estimate number of dirty pages"
counting that involves three different helper functions.  And it's done
in two different places.

Just unify that whole calculation as a "get_nr_dirty_pages()" helper
function, and the merge result will look half-way decent.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aed881a76b22..f027382b54be 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -707,6 +707,17 @@ get_next_work_item(struct backing_dev_info *bdi)
 	return work;
 }
 
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+	return global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) +
+		get_nr_dirty_inodes();
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -724,13 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 		return 0;
 
 	wb->last_old_flush = jiffies;
-	/*
-	 * Add in the number of potentially dirty inodes, because each inode
-	 * write can dirty pagecache in the underlying blockdev.
-	 */
-	nr_pages = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			get_nr_dirty_inodes();
+	nr_pages = get_nr_dirty_pages();
 
 	if (nr_pages) {
 		struct wb_writeback_work work = {
@@ -1086,8 +1091,6 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
@@ -1097,7 +1100,7 @@ void writeback_inodes_sb(struct super_block *sb)
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
+	work.nr_pages = get_nr_dirty_pages();
 
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
-- 
cgit v1.2.3


From 1a5cea7215f7c6bd3c960d7b44e864f3a73d1ad4 Mon Sep 17 00:00:00 2001
From: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Date: Fri, 29 Oct 2010 12:06:42 +0200
Subject: make fanotify_read() restartable across signals

    In fanotify_read() return -ERESTARTSYS instead of -EINTR to
    make read() restartable across signals (BSD semantic).

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index fce66dfbf7d5..063224812b7e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -330,7 +330,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		ret = -EAGAIN;
 		if (file->f_flags & O_NONBLOCK)
 			break;
-		ret = -EINTR;
+		ret = -ERESTARTSYS;
 		if (signal_pending(current))
 			break;
 
-- 
cgit v1.2.3


From 0ceaf6c700f8245946a163e387add8675a0c302f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 17:31:13 -0400
Subject: locks: prevent ENOMEM on lease unlock

Removing a lock shouldn't require any allocations; a failure due to
ENOMEM leaves the caller with a choice between retrying or giving up and
leaking an unused lease.

Next we should split the other lease calls into add and delete cases.
I wanted to start with just the bugfix.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/locks.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 50ec15927aab..06c77734f589 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1441,7 +1441,8 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	return 0;
 
 out:
-	locks_free_lock(lease);
+	if (arg != F_UNLCK)
+		locks_free_lock(lease);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
@@ -1493,17 +1494,16 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-/**
- *	fcntl_setlease	-	sets a lease on an open file
- *	@fd: open file descriptor
- *	@filp: file pointer
- *	@arg: type of lease to obtain
- *
- *	Call this fcntl to establish a lease on the file.
- *	Note that you also need to call %F_SETSIG to
- *	receive a signal when the lease is broken.
- */
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+static int do_fcntl_delete_lease(struct file *filp)
+{
+	struct file_lock fl, *flp = &fl;
+
+	lease_init(filp, F_UNLCK, flp);
+
+	return vfs_setlease(filp, F_UNLCK, &flp);
+}
+
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl;
 	struct fasync_struct *new;
@@ -1521,7 +1521,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 	}
 	lock_flocks();
 	error = __vfs_setlease(filp, arg, &fl);
-	if (error || arg == F_UNLCK)
+	if (error)
 		goto out_unlock;
 
 	/*
@@ -1549,6 +1549,23 @@ out_unlock:
 	return error;
 }
 
+/**
+ *	fcntl_setlease	-	sets a lease on an open file
+ *	@fd: open file descriptor
+ *	@filp: file pointer
+ *	@arg: type of lease to obtain
+ *
+ *	Call this fcntl to establish a lease on the file.
+ *	Note that you also need to call %F_SETSIG to
+ *	receive a signal when the lease is broken.
+ */
+int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+{
+	if (arg == F_UNLCK)
+		return do_fcntl_delete_lease(filp);
+	return do_fcntl_add_lease(fd, filp, arg);
+}
+
 /**
  * flock_lock_file_wait - Apply a FLOCK-style lock to a file
  * @filp: The file to apply the lock to
-- 
cgit v1.2.3


From 096657b65e1ac197e20be5ce7cff6b6ca2532787 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 17:31:14 -0400
Subject: locks: fix leaks on setlease errors

We're depending on setlease to free the passed-in lease on failure.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/locks.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 06c77734f589..63fbc41cc573 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1371,20 +1371,22 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
 
+	lease = *flp;
+
+	error = -EACCES;
 	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-		return -EACCES;
+		goto out;
+	error = -EINVAL;
 	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
+		goto out;
 	error = security_file_lock(filp, arg);
 	if (error)
-		return error;
+		goto out;
 
 	time_out_leases(inode);
 
 	BUG_ON(!(*flp)->fl_lmops->fl_break);
 
-	lease = *flp;
-
 	if (arg != F_UNLCK) {
 		error = -EAGAIN;
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
-- 
cgit v1.2.3


From 05fa3135fdc7b9b510b502a35b6b97d2b38c6f48 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 17:31:15 -0400
Subject: locks: fix setlease methods to free passed-in lock

We modified setlease to require the caller to allocate the new lease in
the case of creating a new lease, but forgot to fix up the filesystem
methods.

Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Steve French <sfrench@samba.org>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/cifsfs.c   | 5 ++++-
 fs/gfs2/file.c     | 2 ++
 fs/locks.c         | 3 ++-
 fs/nfs/file.c      | 3 ++-
 include/linux/fs.h | 1 +
 5 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa79588..54745b6c3db9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -625,8 +625,11 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 		   knows that the file won't be changed on the server
 		   by anyone else */
 		return generic_setlease(file, arg, lease);
-	else
+	else {
+		if (arg != F_UNLCK)
+			locks_free_lock(*lease);
 		return -EAGAIN;
+	}
 }
 
 struct file_system_type cifs_fs_type = {
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5c..ac943c1307b5 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -629,6 +629,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
 {
+	if (arg != F_UNLCK)
+		locks_free_lock(*fl);
 	return -EINVAL;
 }
 
diff --git a/fs/locks.c b/fs/locks.c
index 63fbc41cc573..5b526a977882 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -186,7 +186,7 @@ void locks_release_private(struct file_lock *fl)
 EXPORT_SYMBOL_GPL(locks_release_private);
 
 /* Free a lock which is not in use. */
-static void locks_free_lock(struct file_lock *fl)
+void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
@@ -195,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl)
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
 }
+EXPORT_SYMBOL(locks_free_lock);
 
 void locks_init_lock(struct file_lock *fl)
 {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e756075637b0..1e524fb73ba5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -884,6 +884,7 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
 	dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
 			file->f_path.dentry->d_parent->d_name.name,
 			file->f_path.dentry->d_name.name, arg);
-
+	if (arg != F_UNLCK)
+		locks_free_lock(*fl);
 	return -EINVAL;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7b7b507ffa1c..1eb29399a4ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1129,6 +1129,7 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
 extern int fcntl_getlease(struct file *filp);
 
 /* fs/locks.c */
+void locks_free_lock(struct file_lock *fl);
 extern void locks_init_lock(struct file_lock *);
 extern struct file_lock * locks_alloc_lock(void);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
-- 
cgit v1.2.3


From fcf744a96c66ca6ad7301a372034b771e57f30c4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 30 Oct 2010 17:31:16 -0400
Subject: nfsd4: initialize delegation pointer to lease

The NFSv4 server was initializing the dp->dl_flock pointer by the
somewhat ridiculous method of a locks_copy_lock callback.

Now that setlease uses the passed-in lock instead of doing a copy,
dl_flock no longer gets set, resulting in the lock leaking on delegation
release, and later possible hangs (among other problems).

So, initialize dl_flock and get rid of the callback.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4state.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 56347e0ac88d..b7f818b0580c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2309,22 +2309,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 	dp->dl_flock = NULL;
 }
 
-/*
- * Set the delegation file_lock back pointer.
- *
- * Called from setlease() with lock_kernel() held.
- */
-static
-void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
-{
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
-
-	dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
-	if (!dp)
-		return;
-	dp->dl_flock = new;
-}
-
 /*
  * Called from setlease() with lock_kernel() held
  */
@@ -2355,7 +2339,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
-	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
 	.fl_mylease = nfsd_same_client_deleg_cb,
 	.fl_change = nfsd_change_deleg_cb,
 };
@@ -2661,12 +2644,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	fl->fl_file = find_readable_file(stp->st_file);
 	BUG_ON(!fl->fl_file);
 	fl->fl_pid = current->tgid;
+	dp->dl_flock = fl;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
 	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
+		dp->dl_flock = NULL;
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
-- 
cgit v1.2.3


From 51ee4b84f5c86935b438d6636f34b523edb415a8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 31 Oct 2010 08:35:10 -0400
Subject: locks: let the caller free file_lock on ->setlease failure

The caller allocated it, the caller should free it.

The only issue so far is that we could change the flp pointer even on an
error return if the fl_change callback failed.  But we can simply move
the flp assignment after the fl_change invocation, as the callers don't
care about the flp return value if the setlease call failed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/cifsfs.c    |  5 +----
 fs/gfs2/file.c      |  2 --
 fs/locks.c          | 20 +++++++++++---------
 fs/nfs/file.c       |  2 --
 fs/nfsd/nfs4state.c |  1 +
 5 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 54745b6c3db9..75c4eaa79588 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -625,11 +625,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 		   knows that the file won't be changed on the server
 		   by anyone else */
 		return generic_setlease(file, arg, lease);
-	else {
-		if (arg != F_UNLCK)
-			locks_free_lock(*lease);
+	else
 		return -EAGAIN;
-	}
 }
 
 struct file_system_type cifs_fs_type = {
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ac943c1307b5..aa996471ec5c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -629,8 +629,6 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
 {
-	if (arg != F_UNLCK)
-		locks_free_lock(*fl);
 	return -EINVAL;
 }
 
diff --git a/fs/locks.c b/fs/locks.c
index 5b526a977882..a2ab790471b5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1428,8 +1428,9 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 
 	if (my_before != NULL) {
-		*flp = *my_before;
 		error = lease->fl_lmops->fl_change(my_before, arg);
+		if (!error)
+			*flp = *my_before;
 		goto out;
 	}
 
@@ -1444,8 +1445,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	return 0;
 
 out:
-	if (arg != F_UNLCK)
-		locks_free_lock(lease);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
@@ -1524,8 +1523,11 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	}
 	lock_flocks();
 	error = __vfs_setlease(filp, arg, &fl);
-	if (error)
-		goto out_unlock;
+	if (error) {
+		unlock_flocks();
+		locks_free_lock(fl);
+		goto out_free_fasync;
+	}
 
 	/*
 	 * fasync_insert_entry() returns the old entry if any.
@@ -1541,12 +1543,12 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		fl->fl_type = F_UNLCK | F_INPROGRESS;
 		fl->fl_break_time = jiffies - 10;
 		time_out_leases(inode);
-		goto out_unlock;
+	} else {
+		error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	}
-
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-out_unlock:
 	unlock_flocks();
+
+out_free_fasync:
 	if (new)
 		fasync_free(new);
 	return error;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1e524fb73ba5..60677f9f1311 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -884,7 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
 	dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
 			file->f_path.dentry->d_parent->d_name.name,
 			file->f_path.dentry->d_name.name, arg);
-	if (arg != F_UNLCK)
-		locks_free_lock(*fl);
 	return -EINVAL;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b7f818b0580c..f1e5ec6b5105 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2652,6 +2652,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		dp->dl_flock = NULL;
+		locks_free_lock(fl);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
-- 
cgit v1.2.3


From bb8430a2c8fe2b726033017daadf73c69b0348ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 31 Oct 2010 08:35:31 -0400
Subject: locks: remove fl_copy_lock lock_manager operation

This one was only used for a nasty hack in nfsd, which has recently
been removed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking | 2 --
 fs/locks.c                        | 5 +----
 include/linux/fs.h                | 1 -
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 8a817f656f0a..a91f30890011 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -322,7 +322,6 @@ fl_release_private:	yes	yes
 prototypes:
 	int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
 	void (*fl_notify)(struct file_lock *);  /* unblock callback */
-	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 	void (*fl_release_private)(struct file_lock *);
 	void (*fl_break)(struct file_lock *); /* break_lease callback */
 
@@ -330,7 +329,6 @@ locking rules:
 			BKL	may block
 fl_compare_owner:	yes	no
 fl_notify:		yes	no
-fl_copy_lock:		yes	no
 fl_release_private:	yes	yes
 fl_break:		yes	no
 
diff --git a/fs/locks.c b/fs/locks.c
index a2ab790471b5..65765cb6afed 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -235,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
 			fl->fl_ops->fl_copy_lock(new, fl);
 		new->fl_ops = fl->fl_ops;
 	}
-	if (fl->fl_lmops) {
-		if (fl->fl_lmops->fl_copy_lock)
-			fl->fl_lmops->fl_copy_lock(new, fl);
+	if (fl->fl_lmops)
 		new->fl_lmops = fl->fl_lmops;
-	}
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1eb29399a4ff..334d68a17108 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1056,7 +1056,6 @@ struct lock_manager_operations {
 	int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
 	void (*fl_notify)(struct file_lock *);	/* unblock callback */
 	int (*fl_grant)(struct file_lock *, struct file_lock *, int);
-	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 	void (*fl_release_private)(struct file_lock *);
 	void (*fl_break)(struct file_lock *);
 	int (*fl_mylease)(struct file_lock *, struct file_lock *);
-- 
cgit v1.2.3


From e99d11d19977c74b18411cdb59cdebb788237a6e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 2 Nov 2010 05:29:21 +0900
Subject: fs: logfs: Fix up MTD=y build.

Commit 7d945a3aa760 ("logfs get_sb, part 3") broke the logfs build when
CONFIG_MTD is set due to a mangled logfs_get_sb_mtd() definition.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/logfs/logfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index cd51a36b37f0..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -486,7 +486,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
 
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
 #else
 static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
-- 
cgit v1.2.3


From 50ae28f0144a790fc63a5b89b9aca3ffa9f88522 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 1 Nov 2010 16:08:55 +0100
Subject: FS: cifs, remove unneeded NULL tests

Stanse found that pSMBFile in cifs_ioctl and file->f_path.dentry in
cifs_user_write are dereferenced prior their test to NULL.

The alternative is not to dereference them before the tests. The patch is
to point out the problem, you have to decide.

While at it we cache the inode in cifs_user_write to a local variable
and use all over the function.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Steve French <sfrench@samba.org>
Cc: linux-cifs@vger.kernel.org
Cc: Jeff Layton <jlayton@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c  | 25 +++++++++++--------------
 fs/cifs/ioctl.c |  4 ----
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7fa..5d06eb3078de 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -956,6 +956,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	size_t write_size, loff_t *poffset)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 	unsigned int bytes_written = 0;
 	unsigned int total_written;
@@ -963,7 +964,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	struct cifsTconInfo *pTcon;
 	int xid, long_op;
 	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
@@ -1029,21 +1030,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	cifs_stats_bytes_written(pTcon, total_written);
 
-	/* since the write may have blocked check these pointers again */
-	if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-		struct inode *inode = file->f_path.dentry->d_inode;
 /* Do not update local mtime - server will set its actual value on write
- *		inode->i_ctime = inode->i_mtime =
- * 			current_fs_time(inode->i_sb);*/
-		if (total_written > 0) {
-			spin_lock(&inode->i_lock);
-			if (*poffset > file->f_path.dentry->d_inode->i_size)
-				i_size_write(file->f_path.dentry->d_inode,
-					*poffset);
-			spin_unlock(&inode->i_lock);
-		}
-		mark_inode_dirty_sync(file->f_path.dentry->d_inode);
+ *	inode->i_ctime = inode->i_mtime =
+ * 		current_fs_time(inode->i_sb);*/
+	if (total_written > 0) {
+		spin_lock(&inode->i_lock);
+		if (*poffset > inode->i_size)
+			i_size_write(inode, *poffset);
+		spin_unlock(&inode->i_lock);
 	}
+	mark_inode_dirty_sync(inode);
+
 	FreeXid(xid);
 	return total_written;
 }
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f342..2fa22f20cfc5 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -63,8 +63,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 #ifdef CONFIG_CIFS_POSIX
 		case FS_IOC_GETFLAGS:
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
-				if (pSMBFile == NULL)
-					break;
 				rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
 					&ExtAttrBits, &ExtAttrMask);
 				if (rc == 0)
@@ -80,8 +78,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 					rc = -EFAULT;
 					break;
 				}
-				if (pSMBFile == NULL)
-					break;
 				/* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
 					extAttrBits, &ExtAttrMask);*/
 			}
-- 
cgit v1.2.3


From eb8abb927ae2fd1730e24ea94cd9527f3c086292 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 2 Nov 2010 09:34:50 -0400
Subject: ext4: Remove useless spinlock in ext4_getattr()

Linus noted, and complained to me, that doing while lots of "git diff"'s
of kernel sources, these spinlocks were responsible for 27% of the
spinlock cost on his two-processor system as reported by perf.

Git was doing lots of parallel stats, and this was putting a lot of
pressure on ext4_getattr().  A spinlock to protect a single
memory-to-memory copy is pointless, so remove it.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 191616470466..4d78342f3bf0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5410,9 +5410,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * will return the blocks that include the delayed allocation
 	 * blocks for this file.
 	 */
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
 	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
 	return 0;
-- 
cgit v1.2.3


From f4245bd4ebf903541ba758ad06c118626d8c6f18 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 2 Nov 2010 14:07:17 -0400
Subject: ext4: fix lazyinit hang after removing request

When the request has been removed from the list and no other request
has been issued, we will end up with next wakeup scheduled to
MAX_JIFFY_OFFSET which is bad. So check for that.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af6..8d1d9423ce9a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2740,7 +2740,8 @@ cont_thread:
 		if (freezing(current))
 			refrigerator();
 
-		if (time_after_eq(jiffies, next_wakeup)) {
+		if ((time_after_eq(jiffies, next_wakeup)) ||
+		    (MAX_JIFFY_OFFSET == next_wakeup)) {
 			cond_resched();
 			continue;
 		}
-- 
cgit v1.2.3


From b2c78cd09b6ef78c8f20190f0b3e6df1d3651b70 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 2 Nov 2010 14:19:30 -0400
Subject: ext4: "ret" may be used uninitialized in ext4_lazyinit_thread()

Newer GCC's reported the following build warning:

   fs/ext4/super.c: In function 'ext4_lazyinit_thread':
   fs/ext4/super.c:2702: warning: 'ret' may be used uninitialized in this function

Fix it by removing the need for the ret variable in the first place.

Signed-off-by: "Lukas Czerner" <lczerner@redhat.com>
Reported-by: "Stefan Richter" <stefanr@s5r6.in-berlin.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8d1d9423ce9a..4d7ef31eacb1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2699,7 +2699,6 @@ static int ext4_lazyinit_thread(void *arg)
 	struct ext4_li_request *elr;
 	unsigned long next_wakeup;
 	DEFINE_WAIT(wait);
-	int ret;
 
 	BUG_ON(NULL == eli);
 
@@ -2723,13 +2722,12 @@ cont_thread:
 			elr = list_entry(pos, struct ext4_li_request,
 					 lr_request);
 
-			if (time_after_eq(jiffies, elr->lr_next_sched))
-				ret = ext4_run_li_request(elr);
-
-			if (ret) {
-				ret = 0;
-				ext4_remove_li_request(elr);
-				continue;
+			if (time_after_eq(jiffies, elr->lr_next_sched)) {
+				if (ext4_run_li_request(elr) != 0) {
+					/* error, remove the lazy_init job */
+					ext4_remove_li_request(elr);
+					continue;
+				}
 			}
 
 			if (time_before(elr->lr_next_sched, next_wakeup))
-- 
cgit v1.2.3


From e66673e39ac9d4749bd9676dd1caf928095409f5 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Tue, 2 Nov 2010 12:00:42 +0300
Subject: CIFS: Add cifs_set_oplock_level

Simplify many places when we need to set oplock level on an inode.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  1 +
 fs/cifs/file.c      | 38 +++++++++-----------------------------
 fs/cifs/misc.c      | 23 ++++++++++++++++++++---
 3 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf2..7f050f4fc3d9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,6 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
+extern void cifs_set_oplock_level(struct inode *inode, __u32 oplock);
 
 extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
 				struct file *file, struct tcon_link *tlink,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5d06eb3078de..a566f155df49 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -146,12 +146,7 @@ client_can_cache:
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
 					 xid, NULL);
 
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ)
-		pCifsInode->clientCanCacheRead = true;
+	cifs_set_oplock_level(inode, oplock);
 
 	return rc;
 }
@@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
 	spin_unlock(&cifs_file_list_lock);
 
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock inode %p", inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ)
-		pCifsInode->clientCanCacheRead = true;
+	cifs_set_oplock_level(inode, oplock);
 
 	file->private_data = pCifsFile;
 	return pCifsFile;
@@ -271,8 +261,10 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
  */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
+	struct inode *inode = cifs_file->dentry->d_inode;
 	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
-	struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
@@ -288,8 +280,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	if (list_empty(&cifsi->openFileList)) {
 		cFYI(1, "closing last open instance for inode %p",
 			cifs_file->dentry->d_inode);
-		cifsi->clientCanCacheRead = false;
-		cifsi->clientCanCacheAll  = false;
+		cifs_set_oplock_level(inode, 0);
 	}
 	spin_unlock(&cifs_file_list_lock);
 
@@ -607,8 +598,6 @@ reopen_success:
 		rc = filemap_write_and_wait(inode->i_mapping);
 		mapping_set_error(inode->i_mapping, rc);
 
-		pCifsInode->clientCanCacheAll = false;
-		pCifsInode->clientCanCacheRead = false;
 		if (tcon->unix_ext)
 			rc = cifs_get_inode_info_unix(&inode,
 				full_path, inode->i_sb, xid);
@@ -622,18 +611,9 @@ reopen_success:
 	     invalidate the current end of file on the server
 	     we can not go to the server to get the new inod
 	     info */
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p",
-			 pCifsFile->dentry->d_inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ) {
-		pCifsInode->clientCanCacheRead = true;
-		pCifsInode->clientCanCacheAll = false;
-	} else {
-		pCifsInode->clientCanCacheRead = false;
-		pCifsInode->clientCanCacheAll = false;
-	}
+
+	cifs_set_oplock_level(inode, oplock);
+
 	cifs_relock_file(pCifsFile);
 
 reopen_error_exit:
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe3518..d3b9ddebc17e 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 
 				cFYI(1, "file id match, oplock break");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);
-				pCifsInode->clientCanCacheAll = false;
-				if (pSMB->OplockLevel == 0)
-					pCifsInode->clientCanCacheRead = false;
 
+				cifs_set_oplock_level(netfile->dentry->d_inode,
+						      pSMB->OplockLevel);
 				/*
 				 * cifs_oplock_break_put() can't be called
 				 * from here.  Get reference after queueing
@@ -722,3 +721,21 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 			   cifs_sb_master_tcon(cifs_sb)->treeName);
 	}
 }
+
+void cifs_set_oplock_level(struct inode *inode, __u32 oplock)
+{
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		cinode->clientCanCacheAll = true;
+		cinode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ) {
+		cinode->clientCanCacheAll = false;
+		cinode->clientCanCacheRead = true;
+		cFYI(1, "Level II Oplock granted on inode %p", inode);
+	} else {
+		cinode->clientCanCacheAll = false;
+		cinode->clientCanCacheRead = false;
+	}
+}
-- 
cgit v1.2.3


From df098db12ada832c0232ee1f91eff21a8701889c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Sat, 30 Oct 2010 17:06:21 -0400
Subject: cifs: trivial doc fix: note setlease implemented

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/TODO | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
 
 v) mount check for unmatched uids
 
-w) Add support for new vfs entry points for setlease and fallocate 
+w) Add support for new vfs entry point for fallocate
 
 x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
 processes can proceed better in parallel (on the server)
-- 
cgit v1.2.3


From 413e661c136c52290de1ee19a1b049a4da9dbf51 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 28 Oct 2010 13:33:38 -0400
Subject: cifs: store pointer to master tlink in superblock (try #2)

This is the second version of this patch, the only difference between
it and the first one is that this explicitly makes cifs_sb_master_tlink
a static inline.

Instead of keeping a tag on the master tlink in the tree, just keep a
pointer to the master in the superblock. That eliminates the need for
using the radix tree to look up a tagged entry.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  2 +-
 fs/cifs/connect.c    | 20 ++++----------------
 2 files changed, 5 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a4105..79576dac336f 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,8 +43,8 @@
 
 struct cifs_sb_info {
 	struct radix_tree_root tlink_tree;
-#define CIFS_TLINK_MASTER_TAG		0	/* is "master" (mount) tcon */
 	spinlock_t tlink_tree_lock;
+	struct tcon_link *master_tlink;
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1d..197ac579a70b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2914,11 +2914,11 @@ remote_path_check:
 
 	spin_lock(&cifs_sb->tlink_tree_lock);
 	radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
-	radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
-			   CIFS_TLINK_MASTER_TAG);
 	spin_unlock(&cifs_sb->tlink_tree_lock);
 	radix_tree_preload_end();
 
+	cifs_sb->master_tlink = tlink;
+
 	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
 				TLINK_IDLE_EXPIRE);
 
@@ -3271,22 +3271,10 @@ out:
 	return tcon;
 }
 
-static struct tcon_link *
+static inline struct tcon_link *
 cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
 {
-	struct tcon_link *tlink;
-	unsigned int ret;
-
-	spin_lock(&cifs_sb->tlink_tree_lock);
-	ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
-					0, 1, CIFS_TLINK_MASTER_TAG);
-	spin_unlock(&cifs_sb->tlink_tree_lock);
-
-	/* the master tcon should always be present */
-	if (ret == 0)
-		BUG();
-
-	return tlink;
+	return cifs_sb->master_tlink;
 }
 
 struct cifsTconInfo *
-- 
cgit v1.2.3


From b647c35f77af9c07d336247b23014596e9f0a593 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 28 Oct 2010 11:16:44 -0400
Subject: cifs: convert tlink_tree to a rbtree

Radix trees are ideal when you want to track a bunch of pointers and
can't embed a tracking structure within the target of those pointers.
The tradeoff is an increase in memory, particularly if the tree is
sparse.

In CIFS, we use the tlink_tree to track tcon_link structs. A tcon_link
can never be in more than one tlink_tree, so there's no impediment to
using a rb_tree here instead of a radix tree.

Convert the new multiuser mount code to use a rb_tree instead. This
should reduce the memory required to manage the tlink_tree.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |   4 +-
 fs/cifs/cifsfs.c     |   2 +-
 fs/cifs/cifsglob.h   |   3 +-
 fs/cifs/connect.c    | 177 ++++++++++++++++++++++++++++-----------------------
 4 files changed, 101 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 79576dac336f..e9a393c9c2ca 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
-#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
@@ -42,7 +42,7 @@
 #define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 
 struct cifs_sb_info {
-	struct radix_tree_root tlink_tree;
+	struct rb_root tlink_tree;
 	spinlock_t tlink_tree_lock;
 	struct tcon_link *master_tlink;
 	struct nls_table *local_nls;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa79588..38526a6c4acf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data,
 		return -ENOMEM;
 
 	spin_lock_init(&cifs_sb->tlink_tree_lock);
-	INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+	cifs_sb->tlink_tree = RB_ROOT;
 
 	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
 	if (rc) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612d..b577bf0a1bb3 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -336,7 +336,8 @@ struct cifsTconInfo {
  * "get" on the container.
  */
 struct tcon_link {
-	unsigned long		tl_index;
+	struct rb_node		tl_rbnode;
+	uid_t			tl_uid;
 	unsigned long		tl_flags;
 #define TCON_LINK_MASTER	0
 #define TCON_LINK_PENDING	1
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 197ac579a70b..c9699ce767b6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -116,6 +116,7 @@ struct smb_vol {
 
 static int ipv4_connect(struct TCP_Server_Info *server);
 static int ipv6_connect(struct TCP_Server_Info *server);
+static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
 static void cifs_prune_tlinks(struct work_struct *work);
 
 /*
@@ -2900,24 +2901,16 @@ remote_path_check:
 		goto mount_fail_check;
 	}
 
-	tlink->tl_index = pSesInfo->linux_uid;
+	tlink->tl_uid = pSesInfo->linux_uid;
 	tlink->tl_tcon = tcon;
 	tlink->tl_time = jiffies;
 	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
 	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
 
-	rc = radix_tree_preload(GFP_KERNEL);
-	if (rc == -ENOMEM) {
-		kfree(tlink);
-		goto mount_fail_check;
-	}
-
+	cifs_sb->master_tlink = tlink;
 	spin_lock(&cifs_sb->tlink_tree_lock);
-	radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
+	tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
 	spin_unlock(&cifs_sb->tlink_tree_lock);
-	radix_tree_preload_end();
-
-	cifs_sb->master_tlink = tlink;
 
 	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
 				TLINK_IDLE_EXPIRE);
@@ -3107,32 +3100,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
-	int i, ret;
+	struct rb_root *root = &cifs_sb->tlink_tree;
+	struct rb_node *node;
+	struct tcon_link *tlink;
 	char *tmp;
-	struct tcon_link *tlink[8];
-	unsigned long index = 0;
 
 	cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
 
-	do {
-		spin_lock(&cifs_sb->tlink_tree_lock);
-		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
-					     (void **)tlink, index,
-					     ARRAY_SIZE(tlink));
-		/* increment index for next pass */
-		if (ret > 0)
-			index = tlink[ret - 1]->tl_index + 1;
-		for (i = 0; i < ret; i++) {
-			cifs_get_tlink(tlink[i]);
-			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
-			radix_tree_delete(&cifs_sb->tlink_tree,
-							tlink[i]->tl_index);
-		}
-		spin_unlock(&cifs_sb->tlink_tree_lock);
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	while ((node = rb_first(root))) {
+		tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+		cifs_get_tlink(tlink);
+		clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+		rb_erase(node, root);
 
-		for (i = 0; i < ret; i++)
-			cifs_put_tlink(tlink[i]);
-	} while (ret != 0);
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		cifs_put_tlink(tlink);
+		spin_lock(&cifs_sb->tlink_tree_lock);
+	}
+	spin_unlock(&cifs_sb->tlink_tree_lock);
 
 	tmp = cifs_sb->prepath;
 	cifs_sb->prepathlen = 0;
@@ -3290,6 +3276,47 @@ cifs_sb_tcon_pending_wait(void *unused)
 	return signal_pending(current) ? -ERESTARTSYS : 0;
 }
 
+/* find and return a tlink with given uid */
+static struct tcon_link *
+tlink_rb_search(struct rb_root *root, uid_t uid)
+{
+	struct rb_node *node = root->rb_node;
+	struct tcon_link *tlink;
+
+	while (node) {
+		tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+
+		if (tlink->tl_uid > uid)
+			node = node->rb_left;
+		else if (tlink->tl_uid < uid)
+			node = node->rb_right;
+		else
+			return tlink;
+	}
+	return NULL;
+}
+
+/* insert a tcon_link into the tree */
+static void
+tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct tcon_link *tlink;
+
+	while (*new) {
+		tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
+		parent = *new;
+
+		if (tlink->tl_uid > new_tlink->tl_uid)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&new_tlink->tl_rbnode, parent, new);
+	rb_insert_color(&new_tlink->tl_rbnode, root);
+}
+
 /*
  * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
  * current task.
@@ -3310,14 +3337,14 @@ struct tcon_link *
 cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
 {
 	int ret;
-	unsigned long fsuid = (unsigned long) current_fsuid();
+	uid_t fsuid = current_fsuid();
 	struct tcon_link *tlink, *newtlink;
 
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
 
 	spin_lock(&cifs_sb->tlink_tree_lock);
-	tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+	tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
 	if (tlink)
 		cifs_get_tlink(tlink);
 	spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3326,36 +3353,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
 		newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
 		if (newtlink == NULL)
 			return ERR_PTR(-ENOMEM);
-		newtlink->tl_index = fsuid;
+		newtlink->tl_uid = fsuid;
 		newtlink->tl_tcon = ERR_PTR(-EACCES);
 		set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
 		set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
 		cifs_get_tlink(newtlink);
 
-		ret = radix_tree_preload(GFP_KERNEL);
-		if (ret != 0) {
-			kfree(newtlink);
-			return ERR_PTR(ret);
-		}
-
 		spin_lock(&cifs_sb->tlink_tree_lock);
 		/* was one inserted after previous search? */
-		tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+		tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
 		if (tlink) {
 			cifs_get_tlink(tlink);
 			spin_unlock(&cifs_sb->tlink_tree_lock);
-			radix_tree_preload_end();
 			kfree(newtlink);
 			goto wait_for_construction;
 		}
-		ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
-		spin_unlock(&cifs_sb->tlink_tree_lock);
-		radix_tree_preload_end();
-		if (ret) {
-			kfree(newtlink);
-			return ERR_PTR(ret);
-		}
 		tlink = newtlink;
+		tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+		spin_unlock(&cifs_sb->tlink_tree_lock);
 	} else {
 wait_for_construction:
 		ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3401,39 +3416,39 @@ cifs_prune_tlinks(struct work_struct *work)
 {
 	struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
 						    prune_tlinks.work);
-	struct tcon_link *tlink[8];
-	unsigned long now = jiffies;
-	unsigned long index = 0;
-	int i, ret;
+	struct rb_root *root = &cifs_sb->tlink_tree;
+	struct rb_node *node = rb_first(root);
+	struct rb_node *tmp;
+	struct tcon_link *tlink;
 
-	do {
-		spin_lock(&cifs_sb->tlink_tree_lock);
-		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
-					     (void **)tlink, index,
-					     ARRAY_SIZE(tlink));
-		/* increment index for next pass */
-		if (ret > 0)
-			index = tlink[ret - 1]->tl_index + 1;
-		for (i = 0; i < ret; i++) {
-			if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
-			    atomic_read(&tlink[i]->tl_count) != 0 ||
-			    time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
-				       now)) {
-				tlink[i] = NULL;
-				continue;
-			}
-			cifs_get_tlink(tlink[i]);
-			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
-			radix_tree_delete(&cifs_sb->tlink_tree,
-					  tlink[i]->tl_index);
-		}
-		spin_unlock(&cifs_sb->tlink_tree_lock);
+	/*
+	 * Because we drop the spinlock in the loop in order to put the tlink
+	 * it's not guarded against removal of links from the tree. The only
+	 * places that remove entries from the tree are this function and
+	 * umounts. Because this function is non-reentrant and is canceled
+	 * before umount can proceed, this is safe.
+	 */
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	node = rb_first(root);
+	while (node != NULL) {
+		tmp = node;
+		node = rb_next(tmp);
+		tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
+
+		if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
+		    atomic_read(&tlink->tl_count) != 0 ||
+		    time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
+			continue;
 
-		for (i = 0; i < ret; i++) {
-			if (tlink[i] != NULL)
-				cifs_put_tlink(tlink[i]);
-		}
-	} while (ret != 0);
+		cifs_get_tlink(tlink);
+		clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+		rb_erase(tmp, root);
+
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		cifs_put_tlink(tlink);
+		spin_lock(&cifs_sb->tlink_tree_lock);
+	}
+	spin_unlock(&cifs_sb->tlink_tree_lock);
 
 	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
 				TLINK_IDLE_EXPIRE);
-- 
cgit v1.2.3


From 54eeafe1e4fb7b11da17adacacb1fbe279e0cf6e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 2 Nov 2010 19:22:45 +0000
Subject: [CIFS] Cleanup unused variable build warning

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a566f155df49..71185d1d310a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -264,7 +264,6 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct inode *inode = cifs_file->dentry->d_inode;
 	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
-- 
cgit v1.2.3


From 21b75b019983dfa5c2dda588f4b60b4ca69844a4 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 26 Oct 2010 10:07:17 -0400
Subject: nfsd4: fix 4.1 connection registration race

If a connection is closed just after a sequence or create_session
is sent over it, we could end up trying to register a callback that will
never get called since the xprt is already marked dead.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c             | 16 ++++++++++++----
 include/linux/sunrpc/svc_xprt.h | 18 ++++++++++++++----
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1e5ec6b5105..ad2bfa68d534 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
 	spin_unlock(&clp->cl_lock);
 }
 
-static void nfsd4_register_conn(struct nfsd4_conn *conn)
+static int nfsd4_register_conn(struct nfsd4_conn *conn)
 {
 	conn->cn_xpt_user.callback = nfsd4_conn_lost;
-	register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+	return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
 
 static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 {
 	struct nfsd4_conn *conn;
 	u32 flags = NFS4_CDFC4_FORE;
+	int ret;
 
 	if (ses->se_flags & SESSION4_BACK_CHAN)
 		flags |= NFS4_CDFC4_BACK;
@@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 	if (!conn)
 		return nfserr_jukebox;
 	nfsd4_hash_conn(conn, ses);
-	nfsd4_register_conn(conn);
+	ret = nfsd4_register_conn(conn);
+	if (ret)
+		/* oops; xprt is already down: */
+		nfsd4_conn_lost(&conn->cn_xpt_user);
 	return nfs_ok;
 }
 
@@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
 {
 	struct nfs4_client *clp = ses->se_client;
 	struct nfsd4_conn *c;
+	int ret;
 
 	spin_lock(&clp->cl_lock);
 	c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
 	}
 	__nfsd4_hash_conn(new, ses);
 	spin_unlock(&clp->cl_lock);
-	nfsd4_register_conn(new);
+	ret = nfsd4_register_conn(new);
+	if (ret)
+		/* oops; xprt is already down: */
+		nfsd4_conn_lost(&new->cn_xpt_user);
 	return;
 }
 
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index bbdb680ffbe9..aea0d438e3c7 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -82,18 +82,28 @@ struct svc_xprt {
 	struct net		*xpt_net;
 };
 
-static inline void register_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u)
+static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u)
 {
 	spin_lock(&xpt->xpt_lock);
-	list_add(&u->list, &xpt->xpt_users);
+	list_del_init(&u->list);
 	spin_unlock(&xpt->xpt_lock);
 }
 
-static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u)
+static inline int register_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u)
 {
 	spin_lock(&xpt->xpt_lock);
-	list_del_init(&u->list);
+	if (test_bit(XPT_CLOSE, &xpt->xpt_flags)) {
+		/*
+		 * The connection is about to be deleted soon (or,
+		 * worse, may already be deleted--in which case we've
+		 * already notified the xpt_users).
+		 */
+		spin_unlock(&xpt->xpt_lock);
+		return -ENOTCONN;
+	}
+	list_add(&u->list, &xpt->xpt_users);
 	spin_unlock(&xpt->xpt_lock);
+	return 0;
 }
 
 int	svc_reg_xprt_class(struct svc_xprt_class *);
-- 
cgit v1.2.3


From ce7e010aef63dc6b37a2354f7c9f5f4aedb37978 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 3 Nov 2010 12:03:21 -0400
Subject: ext4: initialize the percpu counters before replaying the journal

We now initialize the percpu counters before replaying the journal,
but after the journal, we recalculate the global counters, to deal
with the possibility of the per-blockgroup counts getting updated by
the journal replay.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 65 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4d7ef31eacb1..04352e9729d0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3347,6 +3347,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext4_count_free_blocks(sb));
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext4_count_free_inodes(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext4_count_dirs(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount3;
+	}
+
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
@@ -3445,22 +3463,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
-no_journal:
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-				  ext4_count_free_blocks(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-					  ext4_count_free_inodes(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-					  ext4_count_dirs(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount_wq;
-	}
+	/*
+	 * The journal may have updated the bg summary counts, so we
+	 * need to update the global counters.
+	 */
+	percpu_counter_set(&sbi->s_freeblocks_counter,
+			   ext4_count_free_blocks(sb));
+	percpu_counter_set(&sbi->s_freeinodes_counter,
+			   ext4_count_free_inodes(sb));
+	percpu_counter_set(&sbi->s_dirs_counter,
+			   ext4_count_dirs(sb));
+	percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
 
+no_journal:
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3610,10 +3625,6 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3621,6 +3632,10 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3948,13 +3963,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	else
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
-		ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-					&EXT4_SB(sb)->s_freeblocks_counter));
-	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
-		es->s_free_inodes_count =
-			cpu_to_le32(percpu_counter_sum_positive(
-					&EXT4_SB(sb)->s_freeinodes_counter));
+	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+					   &EXT4_SB(sb)->s_freeblocks_counter));
+	es->s_free_inodes_count =
+		cpu_to_le32(percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeinodes_counter));
 	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-- 
cgit v1.2.3


From 6ef933a38ade555a175ecab9d803e6bb73399763 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 3 Nov 2010 10:53:49 +0530
Subject: cifs: trivial comment fix: tlink_tree is now a rbtree

Noticed while reviewing (late) the rbtree conversion patchset (which has been merged
already).

Cc: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c9699ce767b6..251a17c03545 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3324,7 +3324,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
  * If the superblock doesn't refer to a multiuser mount, then just return
  * the master tcon for the mount.
  *
- * First, search the radix tree for an existing tcon for this fsuid. If one
+ * First, search the rbtree for an existing tcon for this fsuid. If one
  * exists, then check to see if it's pending construction. If it is then wait
  * for construction to complete. Once it's no longer pending, check to see if
  * it failed and either return an error or retry construction, depending on
-- 
cgit v1.2.3


From d38922949d377da7d47473c7868334408ae3b373 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 2 Nov 2010 16:22:50 -0400
Subject: cifs: dereferencing first then checking

This patch is based on Dan's original patch. His original description is
below:

Smatch complained about a couple checking for NULL after dereferencing
bugs.  I'm not super familiar with the code so I did the conservative
thing and move the dereferences after the checks.

The dereferences in cifs_lock() and cifs_fsync() were added in
ba00ba64cf0 "cifs: make various routines use the cifsFileInfo->tcon
pointer".  The dereference in find_writable_file() was added in
6508d904e6f "cifs: have find_readable/writable_file filter by fsuid".
The comments there say it's possible to trigger the NULL dereference
under stress.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 71185d1d310a..777e7f42b5b1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -754,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
-
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1154,7 +1148,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 					bool fsuid_only)
 {
 	struct cifsFileInfo *open_file;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+	struct cifs_sb_info *cifs_sb;
 	bool any_available = false;
 	int rc;
 
@@ -1168,6 +1162,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 		return NULL;
 	}
 
+	cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
 	/* only filter by fsuid on multiuser mounts */
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
 		fsuid_only = false;
-- 
cgit v1.2.3


From c67236281c5d749741f5414103903a7c1b9c4636 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Wed, 3 Nov 2010 10:58:57 +0300
Subject: cifs: make cifs_set_oplock_level() take a cifsInodeInfo pointer

All the callers already have a pointer to struct cifsInodeInfo. Use it.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    |  3 +--
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/file.c      |  8 ++++----
 fs/cifs/misc.c      | 16 +++++++++-------
 4 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38526a6c4acf..9c3789762ab7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb)
 	/* Until the file is open and we have gotten oplock
 	info back from the server, can not assume caching of
 	file data or metadata */
-	cifs_inode->clientCanCacheRead = false;
-	cifs_inode->clientCanCacheAll = false;
+	cifs_set_oplock_level(cifs_inode, 0);
 	cifs_inode->delete_pending = false;
 	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7f050f4fc3d9..7ed69b6b5fe6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,7 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
-extern void cifs_set_oplock_level(struct inode *inode, __u32 oplock);
+extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
 
 extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
 				struct file *file, struct tcon_link *tlink,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 777e7f42b5b1..06c3e83fa387 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -146,7 +146,7 @@ client_can_cache:
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
 					 xid, NULL);
 
-	cifs_set_oplock_level(inode, oplock);
+	cifs_set_oplock_level(pCifsInode, oplock);
 
 	return rc;
 }
@@ -248,7 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
 	spin_unlock(&cifs_file_list_lock);
 
-	cifs_set_oplock_level(inode, oplock);
+	cifs_set_oplock_level(pCifsInode, oplock);
 
 	file->private_data = pCifsFile;
 	return pCifsFile;
@@ -279,7 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	if (list_empty(&cifsi->openFileList)) {
 		cFYI(1, "closing last open instance for inode %p",
 			cifs_file->dentry->d_inode);
-		cifs_set_oplock_level(inode, 0);
+		cifs_set_oplock_level(cifsi, 0);
 	}
 	spin_unlock(&cifs_file_list_lock);
 
@@ -611,7 +611,7 @@ reopen_success:
 	     we can not go to the server to get the new inod
 	     info */
 
-	cifs_set_oplock_level(inode, oplock);
+	cifs_set_oplock_level(pCifsInode, oplock);
 
 	cifs_relock_file(pCifsFile);
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d3b9ddebc17e..43f10281bc19 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -570,7 +570,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				cFYI(1, "file id match, oplock break");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 
-				cifs_set_oplock_level(netfile->dentry->d_inode,
+				cifs_set_oplock_level(pCifsInode,
 						      pSMB->OplockLevel);
 				/*
 				 * cifs_oplock_break_put() can't be called
@@ -722,18 +722,20 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 	}
 }
 
-void cifs_set_oplock_level(struct inode *inode, __u32 oplock)
+void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 {
-	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	oplock &= 0xF;
 
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+	if (oplock == OPLOCK_EXCLUSIVE) {
 		cinode->clientCanCacheAll = true;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ) {
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+		     &cinode->vfs_inode);
+	} else if (oplock == OPLOCK_READ) {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Level II Oplock granted on inode %p", inode);
+		cFYI(1, "Level II Oplock granted on inode %p",
+		    &cinode->vfs_inode);
 	} else {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = false;
-- 
cgit v1.2.3


From f7ad6d2e9201a6e1c9ee6530a291452eb695feb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:43:33 -0500
Subject: ext4: handle writeback of inodes which are being freed

The following BUG can occur when an inode which is getting freed when
it still has dirty pages outstanding, and it gets deleted (in this
because it was the target of a rename).  In ordered mode, we need to
make sure the data pages are written just in case we crash before the
rename (or unlink) is committed.  If the inode is being freed then
when we try to igrab the inode, we end up tripping the BUG_ON at
fs/ext4/page-io.c:146.

To solve this problem, we need to keep track of the number of io
callbacks which are pending, and avoid destroying the inode until they
have all been completed.  That way we don't have to bump the inode
count to keep the inode from being destroyed; an approach which
doesn't work because the count could have already been dropped down to
zero before the inode writeback has started (at which point we're not
allowed to bump the count back up to 1, since it's already started
getting freed).

Thanks to Dave Chinner for suggesting this approach, which is also
used by XFS.

  kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146!
  Call Trace:
   [<ffffffff811075b1>] ext4_bio_write_page+0x172/0x307
   [<ffffffff811033a7>] mpage_da_submit_io+0x2f9/0x37b
   [<ffffffff811068d7>] mpage_da_map_and_submit+0x2cc/0x2e2
   [<ffffffff811069b3>] mpage_add_bh_to_extent+0xc6/0xd5
   [<ffffffff81106c66>] write_cache_pages_da+0x2a4/0x3ac
   [<ffffffff81107044>] ext4_da_writepages+0x2d6/0x44d
   [<ffffffff81087910>] do_writepages+0x1c/0x25
   [<ffffffff810810a4>] __filemap_fdatawrite_range+0x4b/0x4d
   [<ffffffff810815f5>] filemap_fdatawrite_range+0xe/0x10
   [<ffffffff81122a2e>] jbd2_journal_begin_ordered_truncate+0x7b/0xa2
   [<ffffffff8110615d>] ext4_evict_inode+0x57/0x24c
   [<ffffffff810c14a3>] evict+0x22/0x92
   [<ffffffff810c1a3d>] iput+0x212/0x249
   [<ffffffff810bdf16>] dentry_iput+0xa1/0xb9
   [<ffffffff810bdf6b>] d_kill+0x3d/0x5d
   [<ffffffff810be613>] dput+0x13a/0x147
   [<ffffffff810b990d>] sys_renameat+0x1b5/0x258
   [<ffffffff81145f71>] ? _atomic_dec_and_lock+0x2d/0x4c
   [<ffffffff810b2950>] ? cp_new_stat+0xde/0xea
   [<ffffffff810b29c1>] ? sys_newlstat+0x2d/0x38
   [<ffffffff810b99c6>] sys_rename+0x16/0x18
   [<ffffffff81002a2b>] system_call_fastpath+0x16/0x1b

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Nick Bowler <nbowler@elliptictech.com>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/page-io.c | 59 ++++++++++++++++++++++++++++++++-----------------------
 fs/ext4/super.c   |  2 ++
 3 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f82..670d1343f914 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -858,6 +858,7 @@ struct ext4_inode_info {
 	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..a24c8cca7370 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
+#define WQ_HASH_SZ		37
+#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
 int __init ext4_init_pageio(void)
 {
+	int i;
+
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
+	for (i = 0; i < WQ_HASH_SZ; i++)
+		init_waitqueue_head(&ioend_wq[i]);
 
 	return 0;
 }
@@ -52,9 +60,17 @@ void ext4_exit_pageio(void)
 	kmem_cache_destroy(io_page_cachep);
 }
 
+void ext4_ioend_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = to_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
 	int i;
+	wait_queue_head_t *wq;
 
 	BUG_ON(!io);
 	if (io->page)
@@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io)
 		}
 	}
 	io->num_io_pages = 0;
-	iput(io->inode);
+	wq = to_ioend_wq(io->inode);
+	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+	    waitqueue_active(wq))
+		wake_up_all(wq);
 	kmem_cache_free(io_end_cachep, io);
 }
 
@@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 	io = kmem_cache_alloc(io_end_cachep, flags);
 	if (io) {
 		memset(io, 0, sizeof(*io));
-		io->inode = igrab(inode);
-		BUG_ON(!io->inode);
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
@@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error)
 	struct workqueue_struct *wq;
 	struct inode *inode;
 	unsigned long flags;
-	ext4_fsblk_t err_block;
 	int i;
 
 	BUG_ON(!io_end);
-	inode = io_end->inode;
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = 0;
-	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
 	bio_put(bio);
 
-	if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-		pr_err("sb umounted, discard end_io request for inode %lu\n",
-			io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		return;
-	}
-
-	if (error) {
-		io_end->flag |= EXT4_IO_END_ERROR;
-		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-			     "(offset %llu size %ld starting block %llu)",
-			     inode->i_ino,
-			     (unsigned long long) io_end->offset,
-			     (long) io_end->size,
-			     (unsigned long long) err_block);
-	}
-
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
@@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error)
 		if (!partial_write)
 			SetPageUptodate(page);
 	}
-
 	io_end->num_io_pages = 0;
+	inode = io_end->inode;
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long)
+			     bio->bi_sector >> (inode->i_blkbits - 9));
+	}
 
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io,
 	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
 
-	io_end->inode = inode;
 	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
 	io->io_bio = bio;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04352e9729d0..45653af88953 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
+	atomic_set(&ei->i_ioend_count, 0);
 
 	return &ei->vfs_inode;
 }
 
 static void ext4_destroy_inode(struct inode *inode)
 {
+	ext4_ioend_wait(inode);
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
 		ext4_msg(inode->i_sb, KERN_ERR,
 			 "Inode %lu (%p): orphan list check failed!",
-- 
cgit v1.2.3


From 83668e7141c7a0aa4035bde94344b81f9cf966ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:45:33 -0500
Subject: ext4: fix potential race when freeing ext4_io_page structures

Use an atomic_t and make sure we don't free the structure while we
might still be submitting I/O for that page.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/page-io.c | 38 +++++++++++++++-----------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 670d1343f914..6a5edea2d70b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
 
 struct ext4_io_page {
 	struct page	*p_page;
-	int		p_count;
+	atomic_t	p_count;
 };
 
 #define MAX_IO_PAGES 128
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index a24c8cca7370..7f5451cd1d38 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -67,6 +67,15 @@ void ext4_ioend_wait(struct inode *inode)
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
 
+static void put_io_page(struct ext4_io_page *io_page)
+{
+	if (atomic_dec_and_test(&io_page->p_count)) {
+		end_page_writeback(io_page->p_page);
+		put_page(io_page->p_page);
+		kmem_cache_free(io_page_cachep, io_page);
+	}
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
 	int i;
@@ -75,15 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	BUG_ON(!io);
 	if (io->page)
 		put_page(io->page);
-	for (i = 0; i < io->num_io_pages; i++) {
-		if (--io->pages[i]->p_count == 0) {
-			struct page *page = io->pages[i]->p_page;
-
-			end_page_writeback(page);
-			put_page(page);
-			kmem_cache_free(io_page_cachep, io->pages[i]);
-		}
-	}
+	for (i = 0; i < io->num_io_pages; i++)
+		put_io_page(io->pages[i]);
 	io->num_io_pages = 0;
 	wq = to_ioend_wq(io->inode);
 	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
@@ -235,13 +237,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 			} while (bh != head);
 		}
 
-		if (--io_end->pages[i]->p_count == 0) {
-			struct page *page = io_end->pages[i]->p_page;
-
-			end_page_writeback(page);
-			put_page(page);
-			kmem_cache_free(io_page_cachep, io_end->pages[i]);
-		}
+		put_io_page(io_end->pages[i]);
 
 		/*
 		 * If this is a partial write which happened to make
@@ -369,7 +365,7 @@ submit_and_retry:
 	if ((io_end->num_io_pages == 0) ||
 	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
 		io_end->pages[io_end->num_io_pages++] = io_page;
-		io_page->p_count++;
+		atomic_inc(&io_page->p_count);
 	}
 	return 0;
 }
@@ -398,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		return -ENOMEM;
 	}
 	io_page->p_page = page;
-	io_page->p_count = 0;
+	atomic_set(&io_page->p_count, 1);
 	get_page(page);
 
 	for (bh = head = page_buffers(page), block_start = 0;
@@ -430,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	 * PageWriteback bit from the page to prevent the system from
 	 * wedging later on.
 	 */
-	if (io_page->p_count == 0) {
-		put_page(page);
-		end_page_writeback(page);
-		kmem_cache_free(io_page_cachep, io_page);
-	}
+	put_io_page(io_page);
 	return ret;
 }
-- 
cgit v1.2.3


From 87009d86dc045d228e21242467a67a5f99347553 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 8 Nov 2010 13:47:33 -0500
Subject: ext4: do not try to grab the s_umount semaphore in ext4_quota_off

It's not needed to sync the filesystem, and it fixes a lock_dep complaint.

Signed-off-by: Dmitry Monakhov <dmonakhov@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/super.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 45653af88953..ee91e29ddf95 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4570,12 +4570,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
-	/* Force all delayed allocation blocks to be allocated */
-	if (test_opt(sb, DELALLOC)) {
-		down_read(&sb->s_umount);
+	/* Force all delayed allocation blocks to be allocated.
+	 * Caller already holds s_umount sem */
+	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
-		up_read(&sb->s_umount);
-	}
 
 	return dquot_quota_off(sb, type);
 }
-- 
cgit v1.2.3


From b56ff9d397cecdaad6c98c9d57cc6fea475e1f50 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:49:33 -0500
Subject: ext4: Don't call sb_issue_discard() in ext4_free_blocks()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 5c521830cf (ext4: Support discard requests when running in
no-journal mode) attempts to add sb_issue_discard() for data blocks
(in data=writeback mode) and in no-journal mode.  Unfortunately, this
no longer works, because in commit dd3932eddf (block: remove
BLKDEV_IFL_WAIT), sb_issue_discard() only presents a synchronous
interface, and there are times when we call ext4_free_blocks() when we
are are holding a spinlock, or are otherwise in an atomic context.

For now, I've removed the call to sb_issue_discard() to prevent a
deadlock or (if spinlock debugging is enabled) failures like this:

BUG: scheduling while atomic: rc.sysinit/1376/0x00000002
Pid: 1376, comm: rc.sysinit Not tainted 2.6.36-ARCH #1
Call Trace:
[<ffffffff810397ce>] __schedule_bug+0x5e/0x70
[<ffffffff81403110>] schedule+0x950/0xa70
[<ffffffff81060bad>] ? insert_work+0x7d/0x90
[<ffffffff81060fbd>] ? queue_work_on+0x1d/0x30
[<ffffffff81061127>] ? queue_work+0x37/0x60
[<ffffffff8140377d>] schedule_timeout+0x21d/0x360
[<ffffffff812031c3>] ? generic_make_request+0x2c3/0x540
[<ffffffff81402680>] wait_for_common+0xc0/0x150
[<ffffffff81041490>] ? default_wake_function+0x0/0x10
[<ffffffff812034bc>] ? submit_bio+0x7c/0x100
[<ffffffff810680a0>] ? wake_bit_function+0x0/0x40
[<ffffffff814027b8>] wait_for_completion+0x18/0x20
[<ffffffff8120a969>] blkdev_issue_discard+0x1b9/0x210
[<ffffffff811ba03e>] ext4_free_blocks+0x68e/0xb60
[<ffffffff811b1650>] ? __ext4_handle_dirty_metadata+0x110/0x120
[<ffffffff811b098c>] ext4_ext_truncate+0x8cc/0xa70
[<ffffffff810d713e>] ? pagevec_lookup+0x1e/0x30
[<ffffffff81191618>] ext4_truncate+0x178/0x5d0
[<ffffffff810eacbb>] ? unmap_mapping_range+0xab/0x280
[<ffffffff810d8976>] vmtruncate+0x56/0x70
[<ffffffff811925cb>] ext4_setattr+0x14b/0x460
[<ffffffff811319e4>] notify_change+0x194/0x380
[<ffffffff81117f80>] do_truncate+0x60/0x90
[<ffffffff811e08fa>] ? security_inode_permission+0x1a/0x20
[<ffffffff811eaec1>] ? tomoyo_path_truncate+0x11/0x20
[<ffffffff81127539>] do_last+0x5d9/0x770
[<ffffffff811278bd>] do_filp_open+0x1ed/0x680
[<ffffffff8140644f>] ? page_fault+0x1f/0x30
[<ffffffff81132bfc>] ? alloc_fd+0xec/0x140
[<ffffffff81118db1>] do_sys_open+0x61/0x120
[<ffffffff81118e8b>] sys_open+0x1b/0x20
[<ffffffff81002e6b>] system_call_fastpath+0x16/0x1b

https://bugzilla.kernel.org/show_bug.cgi?id=22302

Reported-by: Mathias Burén <mathias.buren@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: jiayingz@google.com
---
 fs/ext4/mballoc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724a..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, block_group, bit, count);
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
-- 
cgit v1.2.3


From 7ff9c073dd4d7200399076554f7ab9b876f196f6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:51:33 -0500
Subject: ext4: Add new ext4 inode tracepoints

Add ext4_evict_inode, ext4_drop_inode, ext4_mark_inode_dirty, and
ext4_begin_ordered_truncate()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c             |  3 ++
 fs/ext4/super.c             | 10 +++++
 include/trace/events/ext4.h | 97 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 191616470466..846e1e9db434 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
+	trace_ext4_begin_ordered_truncate(inode, new_size);
 	return jbd2_journal_begin_ordered_truncate(
 					EXT4_SB(inode->i_sb)->s_journal,
 					&EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
 	handle_t *handle;
 	int err;
 
+	trace_ext4_evict_inode(inode);
 	if (inode->i_nlink) {
 		truncate_inode_pages(&inode->i_data, 0);
 		goto no_delete;
@@ -5649,6 +5651,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	int err, ret;
 
 	might_sleep();
+	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee91e29ddf95..61182fe6254e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+static int ext4_drop_inode(struct inode *inode)
+{
+	int drop = generic_drop_inode(inode);
+
+	trace_ext4_drop_inode(inode, drop);
+	return drop;
+}
+
 static void ext4_destroy_inode(struct inode *inode)
 {
 	ext4_ioend_wait(inode);
@@ -1175,6 +1183,7 @@ static const struct super_operations ext4_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
 	.put_super	= ext4_put_super,
 	.sync_fs	= ext4_sync_fs,
@@ -1196,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
 	.write_super	= ext4_write_super,
 	.put_super	= ext4_put_super,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 289010d3270b..e5e345fb2a5c 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode,
 		  (unsigned long) __entry->dir, __entry->mode)
 );
 
+TRACE_EVENT(ext4_evict_inode,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	int,   dev_major                )
+		__field(	int,   dev_minor                )
+		__field(	ino_t,	ino			)
+		__field(	int,	nlink			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->nlink	= inode->i_nlink;
+	),
+
+	TP_printk("dev %d,%d ino %lu nlink %d",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, __entry->nlink)
+);
+
+TRACE_EVENT(ext4_drop_inode,
+	TP_PROTO(struct inode *inode, int drop),
+
+	TP_ARGS(inode, drop),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(	int,	drop			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->drop	= drop;
+	),
+
+	TP_printk("dev %d,%d ino %lu drop %d",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, __entry->drop)
+);
+
+TRACE_EVENT(ext4_mark_inode_dirty,
+	TP_PROTO(struct inode *inode, unsigned long IP),
+
+	TP_ARGS(inode, IP),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(unsigned long,	ip			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->ip	= IP;
+	),
+
+	TP_printk("dev %d,%d ino %lu caller %pF",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_begin_ordered_truncate,
+	TP_PROTO(struct inode *inode, loff_t new_size),
+
+	TP_ARGS(inode, new_size),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(	loff_t,	new_size		)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major	= MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor	= MINOR(inode->i_sb->s_dev);
+		__entry->ino		= inode->i_ino;
+		__entry->new_size	= new_size;
+	),
+
+	TP_printk("dev %d,%d ino %lu new_size %lld",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino,
+		  (long long) __entry->new_size)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_begin,
 
 	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-- 
cgit v1.2.3


From 618763958b2291a09057dbfa553da6ded93dcfad Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 8 Nov 2010 07:28:32 -0500
Subject: cifs: make cifs_ioctl handle NULL filp->private_data correctly

Commit 13cfb7334e made cifs_ioctl use the tlink attached to the
cifsFileInfo for a filp. This ignores the case of an open directory
however, which in CIFS can have a NULL private_data until a readdir
is done on it.

This patch re-adds the NULL pointer checks that were removed in commit
50ae28f01 and moves the setting of tcon and "caps" variables lower.

Long term, a better fix would be to establish a f_op->open routine for
directories that populates that field at open time, but that requires
some other changes to how readdir calls are handled.

Reported-by: Kjell Rune Skaaraas <kjella79@yahoo.no>
Reviewed-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/ioctl.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 2fa22f20cfc5..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
 	struct cifsFileInfo *pSMBFile = filep->private_data;
-	struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
+	struct cifsTconInfo *tcon;
 	__u64	ExtAttrBits = 0;
 	__u64	ExtAttrMask = 0;
-	__u64   caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+	__u64   caps;
 #endif /* CONFIG_CIFS_POSIX */
 
 	xid = GetXid();
@@ -62,6 +62,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 			break;
 #ifdef CONFIG_CIFS_POSIX
 		case FS_IOC_GETFLAGS:
+			if (pSMBFile == NULL)
+				break;
+			tcon = tlink_tcon(pSMBFile->tlink);
+			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
 				rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
 					&ExtAttrBits, &ExtAttrMask);
@@ -73,6 +77,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 			break;
 
 		case FS_IOC_SETFLAGS:
+			if (pSMBFile == NULL)
+				break;
+			tcon = tlink_tcon(pSMBFile->tlink);
+			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
 				if (get_user(ExtAttrBits, (int __user *)arg)) {
 					rc = -EFAULT;
-- 
cgit v1.2.3


From 0e15482566b752718e7225168380904f1d0cdfa3 Mon Sep 17 00:00:00 2001
From: Meelis Roos <mroos@linux.ee>
Date: Mon, 8 Nov 2010 13:38:14 -0800
Subject: sparc: fix openpromfs compile

Fix openpromfs compilation by adding a missing semicolon in
fs/openpromfs/inode.c openprom_mount().

Signed-off-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/openpromfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e5..911e61f348fc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -418,7 +418,7 @@ out_no_root:
 static struct dentry *openprom_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_single(fs_type, flags, data, openprom_fill_super)
+	return mount_single(fs_type, flags, data, openprom_fill_super);
 }
 
 static struct file_system_type openprom_fs_type = {
-- 
cgit v1.2.3


From 3565bd46b1c6a3dbf1f670d3275aa4018a4c65ae Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Tue, 9 Nov 2010 12:27:41 +0530
Subject: cifs: fix a memleak in cifs_setattr_nounix()

Andrew Hendry reported a kmemleak warning in 2.6.37-rc1 while editing a
text file with gedit over cifs.

unreferenced object 0xffff88022ee08b40 (size 32):
  comm "gedit", pid 2524, jiffies 4300160388 (age 2633.655s)
  hex dump (first 32 bytes):
    5c 2e 67 6f 75 74 70 75 74 73 74 72 65 61 6d 2d  \.goutputstream-
    35 42 41 53 4c 56 00 de 09 00 00 00 2c 26 78 ee  5BASLV......,&x.
  backtrace:
    [<ffffffff81504a4d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff81136e13>] __kmalloc+0xe3/0x1d0
    [<ffffffffa0313db0>] build_path_from_dentry+0xf0/0x230 [cifs]
    [<ffffffffa031ae1e>] cifs_setattr+0x9e/0x770 [cifs]
    [<ffffffff8115fe90>] notify_change+0x170/0x2e0
    [<ffffffff81145ceb>] sys_fchmod+0x10b/0x140
    [<ffffffff8100c172>] system_call_fastpath+0x16/0x1b
    [<ffffffffffffffff>] 0xffffffffffffffff

The commit 1025774c that removed inode_setattr() seems to have introduced this
memleak by returning early without freeing 'full_path'.

Reported-by: Andrew Hendry <andrew.hendry@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3efb..ef3a55bf86b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2177,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	setattr_copy(inode, attrs);
 	mark_inode_dirty(inode);
-	return 0;
 
 cifs_setattr_exit:
 	kfree(full_path);
-- 
cgit v1.2.3


From f3f63c1c28bc861a931fac283b5bc3585efb8967 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 29 Oct 2010 11:46:56 -0600
Subject: block: limit vec count in bio_kmalloc() and bio_alloc_map_data()

Reported-by: Dan Rosenberg <drosenberg@vsecurity.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..8317a2c106bc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio;
 
+	if (nr_iovecs > UIO_MAXIOV)
+		return NULL;
+
 	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
 		      gfp_mask);
 	if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
 					       gfp_t gfp_mask)
 {
-	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
+	struct bio_map_data *bmd;
+
+	if (iov_count > UIO_MAXIOV)
+		return NULL;
 
+	bmd = kmalloc(sizeof(*bmd), gfp_mask);
 	if (!bmd)
 		return NULL;
 
-- 
cgit v1.2.3


From cb4644cac4a2797afc847e6c92736664d4b0ea34 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 10 Nov 2010 14:36:25 +0100
Subject: bio: take care not overflow page count when mapping/copying user data

If the iovec is being set up in a way that causes uaddr + PAGE_SIZE
to overflow, we could end up attempting to map a huge number of
pages. Check for this invalid input type.

Reported-by: Dan Rosenberg <drosenberg@vsecurity.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 8317a2c106bc..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -834,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		start = uaddr >> PAGE_SHIFT;
 
+		/*
+		 * Overflow, abort
+		 */
+		if (end < start)
+			return ERR_PTR(-EINVAL);
+
 		nr_pages += end - start;
 		len += iov[i].iov_len;
 	}
@@ -962,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		unsigned long start = uaddr >> PAGE_SHIFT;
 
+		/*
+		 * Overflow, abort
+		 */
+		if (end < start)
+			return ERR_PTR(-EINVAL);
+
 		nr_pages += end - start;
 		/*
 		 * buffer must be aligned to at least hardsector size for now
@@ -989,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 		unsigned long start = uaddr >> PAGE_SHIFT;
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
-		
+
 		ret = get_user_pages_fast(uaddr, local_nr_pages,
 				write_to_vm, &pages[cur_page]);
 		if (ret < local_nr_pages) {
-- 
cgit v1.2.3


From 1447399b3e34af016c368b4178db7ef0e04e15b0 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Tue, 9 Nov 2010 21:33:02 +0100
Subject: ioprio: fix RCU locking around task dereference

With 2.6.37-rc1, I observe sys_ioprio_set not taking the RCU lock [1]
across access to the task credentials.

Inspecting the code in fs/ioprio.c, the tasklist_lock is held for read
across the __task_cred call, which is presumably sufficient to prevent
the task credentials becoming stale.

===================================================

[ INFO: suspicious rcu_dereference_check() usage. ]

---------------------------------------------------

kernel/pid.c:419 invoked rcu_dereference_check() without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 1

1 lock held by start-stop-daem/2246:

 #0:  (tasklist_lock){.?.?..}, at: [<ffffffff811a2dfa>]
sys_ioprio_set+0x8a/0x400

stack backtrace:

Pid: 2246, comm: start-stop-daem Not tainted 2.6.37-rc1-330cd+ #2

Call Trace:

 [<ffffffff8109f5f4>] lockdep_rcu_dereference+0xa4/0xc0

 [<ffffffff81085651>] find_task_by_pid_ns+0x81/0x90

 [<ffffffff8108567d>] find_task_by_vpid+0x1d/0x20

 [<ffffffff811a3160>] sys_ioprio_set+0x3f0/0x400

 [<ffffffff816efa79>] ? trace_hardirqs_on_thunk+0x3a/0x3f

 [<ffffffff81003482>] system_call_fastpath+0x16/0x1b

Take the RCU lock for read across acquiring the pointer to the task
credentials and dereferencing it.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>

Fixed up by Jens to fix missing rcu_read_unlock() on mismatches.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ioprio.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..8def14e24c37 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -139,7 +139,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid != who)
+				int match;
+
+				rcu_read_lock();
+				match = __task_cred(p)->uid == who;
+				rcu_read_unlock();
+				if (!match)
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -232,7 +237,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
-				if (__task_cred(p)->uid != user->uid)
+				int match;
+
+				rcu_read_lock();
+				match = __task_cred(p)->uid == user->uid;
+				rcu_read_unlock();
+				if (!match)
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
-- 
cgit v1.2.3


From f85acd81aa623e3dcf268c90e5cd8ecf36830984 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 9 Nov 2010 21:26:56 +0100
Subject: ioprio: rcu_read_lock/unlock protect find_task_by_vpid call (V2)

Commit 4221a9918e38b7494cee341dda7b7b4bb8c04bde "Add RCU check for
find_task_by_vpid()" introduced rcu_lockdep_assert to find_task_by_pid_ns=

Assertion failed in sys_ioprio_get. The patch is fixing assertion
failure in ioprio_set as well.

 kernel/pid.c:419 invoked rcu_dereference_check() without protection!

 stack backtrace:
 Pid: 4254, comm: iotop Not tainted
 Call Trace:
 [<ffffffff810656f2>] lockdep_rcu_dereference+0xaa/0xb2
 [<ffffffff81053c67>] find_task_by_pid_ns+0x4f/0x68
 [<ffffffff81053c9d>] find_task_by_vpid+0x1d/0x1f
 [<ffffffff811104e2>] sys_ioprio_get+0x50/0x2da
 [<ffffffff81002182>] system_call_fastpath+0x16/0x1b

V2: rcu critical section expanded according to comment by Paul E. McKenney

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ioprio.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 8def14e24c37..2f7d05c89922 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 	read_lock(&tasklist_lock);
 	switch (which) {
 		case IOPRIO_WHO_PROCESS:
+			rcu_read_lock();
 			if (!who)
 				p = current;
 			else
 				p = find_task_by_vpid(who);
 			if (p)
 				ret = set_task_ioprio(p, ioprio);
+			rcu_read_unlock();
 			break;
 		case IOPRIO_WHO_PGRP:
 			if (!who)
@@ -205,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 	read_lock(&tasklist_lock);
 	switch (which) {
 		case IOPRIO_WHO_PROCESS:
+			rcu_read_lock();
 			if (!who)
 				p = current;
 			else
 				p = find_task_by_vpid(who);
 			if (p)
 				ret = get_task_ioprio(p);
+			rcu_read_unlock();
 			break;
 		case IOPRIO_WHO_PGRP:
 			if (!who)
-- 
cgit v1.2.3


From 5d0af85cd0964bb845b63d5059bb20e8f7731e65 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 28 Oct 2010 21:37:10 +0000
Subject: xfs: remove experimental tag from the delaylog option

We promised to do this for 2.6.37, and the code looks stable enough to
keep that promise.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 Documentation/filesystems/xfs-delayed-logging-design.txt | 11 -----------
 fs/xfs/linux-2.6/xfs_super.c                             |  3 ---
 2 files changed, 14 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
index 96d0df28bed3..7445bf335dae 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.txt
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -794,17 +794,6 @@ designed.
 
 Roadmap:
 
-2.6.37 Remove experimental tag from mount option
-	=> should be roughly 6 months after initial merge
-	=> enough time to:
-		=> gain confidence and fix problems reported by early
-		   adopters (a.k.a. guinea pigs)
-		=> address worst performance regressions and undesired
-		   behaviours
-		=> start tuning/optimising code for parallelism
-		=> start tuning/optimising algorithms consuming
-		   excessive CPU time
-
 2.6.39 Switch default mount option to use delayed logging
 	=> should be roughly 12 months after initial merge
 	=> enough time to shake out remaining problems before next round of
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae4..064f964d4f3c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
 			mp->m_qflags &= ~XFS_OQUOTA_ENFD;
 		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
-			cmn_err(CE_WARN,
-				"Enabling EXPERIMENTAL delayed logging feature "
-				"- use at your own risk.\n");
 		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
 			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, "ihashsize")) {
-- 
cgit v1.2.3


From 6762b938eac878a30a90e770ac655874c36bc642 Mon Sep 17 00:00:00 2001
From: Kulikov Vasiliy <segooon@gmail.com>
Date: Sat, 30 Oct 2010 14:26:17 +0000
Subject: xfs: xfs_ioctl: fix information leak to userland

al_hreq is copied from userland.  If al_hreq.buflen is not properly aligned
then xfs_attr_list will ignore the last bytes of kbuf.  These bytes are
unitialized.  It leads to leaking of contents of kernel stack memory.

Signed-off-by: Vasiliy Kulikov <segooon@gmail.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38e..ad442d9e392e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+	kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
 	if (!kbuf)
 		goto out_dput;
 
-- 
cgit v1.2.3


From f83282a8ef799c0bdcb0c32971487087da1bc216 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Nov 2010 08:55:04 +0000
Subject: xfs: fix per-ag reference counting in inode reclaim tree walking

The walk fails to decrement the per-ag reference count when the
non-blocking walk fails to obtain the per-ag reclaim lock, leading
to an assert failure on debug kernels when unmounting a filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 1 +
 fs/xfs/xfs_mount.c          | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981d..afb0d7cfad1c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -853,6 +853,7 @@ restart:
 		if (trylock) {
 			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
 				skipped++;
+				xfs_perag_put(pag);
 				continue;
 			}
 			first_index = pag->pag_ici_reclaim_cursor;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a399..19e9dfa1c254 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
-- 
cgit v1.2.3


From bfe2741967eaa3434fa9b3d8f24b1422d4540e7d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Nov 2010 08:55:05 +0000
Subject: xfs: move delayed write buffer trace

The delayed write buffer split trace currently issues a trace for
every buffer it scans. These buffers are not necessarily queued for
delayed write. Indeed, when buffers are pinned, there can be
thousands of traces of buffers that aren't actually queued for
delayed write and the ones that are are lost in the noise. Move the
trace point to record only buffers that are split out for IO to be
issued on.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb57..aa1d353def29 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1781,7 +1781,6 @@ xfs_buf_delwri_split(
 	INIT_LIST_HEAD(list);
 	spin_lock(dwlk);
 	list_for_each_entry_safe(bp, n, dwq, b_list) {
-		trace_xfs_buf_delwri_split(bp, _RET_IP_);
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
 		if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1794,7 @@ xfs_buf_delwri_split(
 					 _XBF_RUN_QUEUES);
 			bp->b_flags |= XBF_WRITE;
 			list_move_tail(&bp->b_list, list);
+			trace_xfs_buf_delwri_split(bp, _RET_IP_);
 		} else
 			skipped++;
 	}
-- 
cgit v1.2.3


From 785ce41805ea7b6a9b2775ed9f4cf10cd7a90c03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 6 Nov 2010 11:42:44 +0000
Subject: xfs: tell lockdep about parent iolock usage in filestreams

The filestreams code may take the iolock on the parent inode while
holding it on a child.  This is the only place in XFS where we take
both the child and parent iolock, so just telling lockdep about it
is enough.  The lock flag required for that was already added as
part of the ilock lockdep annotations and unused so far.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_filestream.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
 	 * If the file's parent directory is known, take its iolock in exclusive
 	 * mode to prevent two sibling files from racing each other to migrate
 	 * themselves and their parent to different AGs.
+	 *
+	 * Note that we lock the parent directory iolock inside the child
+	 * iolock here.  That's fine as we never hold both parent and child
+	 * iolock in any other place.  This is different from the ilock,
+	 * which requires locking of the child after the parent for namespace
+	 * operations.
 	 */
 	if (pip)
-		xfs_ilock(pip, XFS_IOLOCK_EXCL);
+		xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 
 	/*
 	 * A new AG needs to be found for the file.  If the file's parent
-- 
cgit v1.2.3


From 5d2bf8a55e03b0e59ed5a4ac2ff7f9ee3ba7e40d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 6 Nov 2010 11:42:56 +0000
Subject: xfs: fix a few compiler warnings with CONFIG_XFS_QUOTA=n

Andi Kleen reported that gcc-4.5 gives lots of warnings for him
inside the XFS code.  It turned out most of them are due to the
quota stubs beeing macros, and gcc now complaining about macros
evaluating to 0 that are not assigned to variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_quota.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
 #define xfs_trans_apply_dquot_deltas(tp)
 #define xfs_trans_unreserve_and_mod_dquots(tp)
-#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)	(0)
-#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)	(0)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+		struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+	return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+		struct xfs_mount *mp, struct xfs_dquot *udqp,
+		struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+{
+	return 0;
+}
 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
 #define xfs_qm_vop_rename_dqattach(it)					(0)
 #define xfs_qm_vop_chown(tp, ip, old, new)				(NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_qm_dqdetach(ip)
 #define xfs_qm_dqrele(d)
 #define xfs_qm_statvfs(ip, s)
-#define xfs_qm_sync(mp, fl)						(0)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+	return 0;
+}
 #define xfs_qm_newmount(mp, a, b)					(0)
 #define xfs_qm_mount_quotas(mp)
 #define xfs_qm_unmount(mp)
-#define xfs_qm_unmount_quotas(mp)					(0)
+#define xfs_qm_unmount_quotas(mp)
 #endif /* CONFIG_XFS_QUOTA */
 
 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
-- 
cgit v1.2.3


From c6f6cd0608b1826ee1797cf57a808416e4bdb806 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 6 Nov 2010 11:43:08 +0000
Subject: xfs: use hlist_add_fake

XFS does not need it's inodes to actuall be hashed in the VFS inode
cache, but we require the inode to be marked hashed for the
writeback code to work.

Insted of using insert_inode_hash, which requires a second
inode_lock roundtrip after the partial merge of the inode
scalability patches in 2.6.37-rc simply use the new hlist_add_fake
helper to mark it hashed without requiring a lock or touching a
global cache line.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c61..94d5fd6a2973 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -762,7 +762,8 @@ xfs_setup_inode(
 	inode->i_state = I_NEW;
 
 	inode_sb_list_add(inode);
-	insert_inode_hash(inode);
+	/* make the inode look hashed for the writeback code */
+	hlist_add_fake(&inode->i_hash);
 
 	inode->i_mode	= ip->i_d.di_mode;
 	inode->i_nlink	= ip->i_d.di_nlink;
-- 
cgit v1.2.3


From 3df057ac9afe83c4af84016df3baf3a0eb1d3d33 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 3 Nov 2010 16:49:44 -0400
Subject: locks: fix leak on merging leases

We must also free the passed-in lease in the case it wasn't used because
an existing lease was upgrade/downgraded or already existed.

Note the nfsd caller doesn't care because it's fl_change callback
returns an error in those cases.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 65765cb6afed..61c22f722050 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1504,7 +1504,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
-	struct file_lock *fl;
+	struct file_lock *fl, *ret;
 	struct fasync_struct *new;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
@@ -1518,6 +1518,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		locks_free_lock(fl);
 		return -ENOMEM;
 	}
+	ret = fl;
 	lock_flocks();
 	error = __vfs_setlease(filp, arg, &fl);
 	if (error) {
@@ -1525,6 +1526,8 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
+	if (ret != fl)
+		locks_free_lock(fl);
 
 	/*
 	 * fasync_insert_entry() returns the old entry if any.
@@ -1532,7 +1535,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	 * inserted it into the fasync list. Clear new so that
 	 * we don't release it here.
 	 */
-	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+	if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
 		new = NULL;
 
 	if (error < 0) {
-- 
cgit v1.2.3


From 8896b93f42459b18b145c69d399b62870df48061 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 3 Nov 2010 18:09:18 -0400
Subject: locks: remove dead lease error-handling code

A minor oversight from f7347ce4ee7c65415f84be915c018473e7076f31,
"fasync: re-organize fasync entry insertion to allow it under a
spinlock": this cleanup-on-error was only needed to handle -ENOMEM.  Now
that we're preallocating it's unneeded.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 61c22f722050..0e62dd35d088 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1506,7 +1506,6 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
 	struct fasync_struct *new;
-	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
 	fl = lease_alloc(filp, arg);
@@ -1520,7 +1519,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	}
 	ret = fl;
 	lock_flocks();
-	error = __vfs_setlease(filp, arg, &fl);
+	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
 		unlock_flocks();
 		locks_free_lock(fl);
@@ -1538,14 +1537,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
 		new = NULL;
 
-	if (error < 0) {
-		/* remove lease just inserted by setlease */
-		fl->fl_type = F_UNLCK | F_INPROGRESS;
-		fl->fl_break_time = jiffies - 10;
-		time_out_leases(inode);
-	} else {
-		error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	}
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	unlock_flocks();
 
 out_free_fasync:
-- 
cgit v1.2.3


From ece413f59f257682de4a2e2e42af33b016af53f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 10 Nov 2010 21:39:11 +0000
Subject: xfs: remove incorrect assert in xfs_vm_writepage

In commit 20cb52ebd1b5ca6fa8a5d9b6b1392292f5ca8a45, titled
"xfs: simplify xfs_vm_writepage" I added an assert that any !mapped and
uptodate buffers are not dirty.  That asserts turns out to trigger a lot
when running fsx on filesystems with small block sizes.  The reason for
that is that the assert is simply incorrect.  !mapped and uptodate
just mean this buffer covers a hole, and whenever we do a set_page_dirty
we mark all blocks in the page dirty, no matter if they have data or
not.  So remove the assert, and update the comment above the condition
to match reality.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd7..7d287afccde5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1111,11 +1111,12 @@ xfs_vm_writepage(
 			uptodate = 0;
 
 		/*
-		 * A hole may still be marked uptodate because discard_buffer
-		 * leaves the flag set.
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
 		 */
 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			ASSERT(!buffer_dirty(bh));
 			imap_valid = 0;
 			continue;
 		}
-- 
cgit v1.2.3


From 52ca0e84b05595cf74f1ff772b3f9807256b1b27 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 11 Nov 2010 14:05:20 -0800
Subject: hugetlbfs: lessen the impact of a deprecation warning

WARN_ONCE is a bit strong for a deprecation warning, given that it spews a
huge backtrace.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a40..a5fe68189eed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
 		*user = current_user();
 		if (user_shm_lock(size, *user)) {
-			WARN_ONCE(1,
-			  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+			printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
 		} else {
 			*user = NULL;
 			return ERR_PTR(-EPERM);
-- 
cgit v1.2.3


From 1c66b360fe26204e2aa14e45086b4a6b8890b1a2 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Sat, 13 Nov 2010 16:22:02 +0800
Subject: ocfs2: Change some lock status member in ocfs2_lock_res to char.

Commit 83fd9c7 changes l_level, l_requested and l_blocking of
ocfs2_lock_res from int to unsigned char. But actually it is
initially as -1(ocfs2_lock_res_init_common) which
correspoding to 255 for unsigned char. So the whole dlm lock
mechanism doesn't work now which means a disaster to ocfs2.

Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/ocfs2.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3bd..1efea3615589 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
 	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	unsigned char            l_level;
+	char			 l_level;
+	char			 l_requested;
+	char			 l_blocking;
 
 	/* Data packed - type enum ocfs2_lock_type */
 	unsigned char            l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
 	unsigned char            l_action;
 	/* Data packed - enum type ocfs2_unlock_action */
 	unsigned char            l_unlock_action;
-	unsigned char            l_requested;
-	unsigned char            l_blocking;
 	unsigned int             l_pending_gen;
 
 	spinlock_t               l_lock;
-- 
cgit v1.2.3


From 044b9414c7caf9a26192c73a5b88fa1a8a32a1c1 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Nov 2010 20:01:07 +0000
Subject: GFS2: Fix inode deallocation race

This area of the code has always been a bit delicate due to the
subtleties of lock ordering. The problem is that for "normal"
alloc/dealloc, we always grab the inode locks first and the rgrp lock
later.

In order to ensure no races in looking up the unlinked, but still
allocated inodes, we need to hold the rgrp lock when we do the lookup,
which means that we can't take the inode glock.

The solution is to borrow the technique already used by NFS to solve
what is essentially the same problem (given an inode number, look up
the inode carefully, checking that it really is in the expected
state).

We cannot do that directly from the allocation code (lock ordering
again) so we give the job to the pre-existing delete workqueue and
carry on with the allocation as normal.

If we find there is no space, we do a journal flush (required anyway
if space from a deallocation is to be released) which should block
against the pending deallocations, so we should always get the space
back.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/export.c |  46 ++---------------
 fs/gfs2/glock.c  |  21 ++++----
 fs/gfs2/inode.c  | 152 +++++++++++++------------------------------------------
 fs/gfs2/inode.h  |   4 +-
 fs/gfs2/rgrp.c   |  91 +++++++++++++++++----------------
 5 files changed, 98 insertions(+), 216 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d34..5ab3839dfcb9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 				      struct gfs2_inum_host *inum)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_holder i_gh;
 	struct inode *inode;
 	struct dentry *dentry;
-	int error;
 
 	inode = gfs2_ilookup(sb, inum->no_addr);
 	if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 		goto out_inode;
 	}
 
-	error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
-				  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return ERR_PTR(error);
-
-	error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
-	if (error)
-		goto fail;
-
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		goto fail;
-	}
-
-	error = gfs2_inode_refresh(GFS2_I(inode));
-	if (error) {
-		iput(inode);
-		goto fail;
-	}
-
-	/* Pick up the works we bypass in gfs2_inode_lookup */
-	if (inode->i_state & I_NEW) 
-		gfs2_set_iop(inode);
-
-	if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-		iput(inode);
-		goto fail;
-	}
-
-	error = -EIO;
-	if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
-		iput(inode);
-		goto fail;
-	}
-
-	gfs2_glock_dq_uninit(&i_gh);
+	inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
+				    GFS2_BLKST_DINODE);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 out_inode:
 	dentry = d_obtain_alias(inode);
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
-fail:
-	gfs2_glock_dq_uninit(&i_gh);
-	return ERR_PTR(error);
 }
 
 static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f099..f92c17704169 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work)
 {
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct gfs2_inode *ip = NULL;
+	struct gfs2_inode *ip;
 	struct inode *inode;
-	u64 no_addr = 0;
+	u64 no_addr = gl->gl_name.ln_number;
+
+	ip = gl->gl_object;
+	/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
 
-	spin_lock(&gl->gl_spin);
-	ip = (struct gfs2_inode *)gl->gl_object;
 	if (ip)
-		no_addr = ip->i_no_addr;
-	spin_unlock(&gl->gl_spin);
-	if (ip) {
 		inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
-		if (inode) {
-			d_prune_aliases(inode);
-			iput(inode);
-		}
+	else
+		inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+	if (inode && !IS_ERR(inode)) {
+		d_prune_aliases(inode);
+		iput(inode);
 	}
 	gfs2_glock_put(gl);
 }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8cf..e1213f7f9217 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
 	return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
 
-struct gfs2_skip_data {
-	u64	no_addr;
-	int	skipped;
-};
-
-static int iget_skip_test(struct inode *inode, void *opaque)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_skip_data *data = opaque;
-
-	if (ip->i_no_addr == data->no_addr) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE)){
-			data->skipped = 1;
-			return 0;
-		}
-		return 1;
-	}
-	return 0;
-}
-
-static int iget_skip_set(struct inode *inode, void *opaque)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_skip_data *data = opaque;
-
-	if (data->skipped)
-		return 1;
-	inode->i_ino = (unsigned long)(data->no_addr);
-	ip->i_no_addr = data->no_addr;
-	return 0;
-}
-
-static struct inode *gfs2_iget_skip(struct super_block *sb,
-				    u64 no_addr)
-{
-	struct gfs2_skip_data data;
-	unsigned long hash = (unsigned long)no_addr;
-
-	data.no_addr = no_addr;
-	data.skipped = 0;
-	return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
-}
-
 /**
  * GFS2 lookup code fills in vfs inode contents based on info obtained
  * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
 	return ERR_PTR(error);
 }
 
-/**
- * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
- *                               and try to reclaim it by doing iput.
- *
- * This function assumes no rgrp locks are currently held.
- *
- * @sb: The super block
- * no_addr: The inode number
- *
- */
-
-void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+				  u64 *no_formal_ino, unsigned int blktype)
 {
-	struct gfs2_sbd *sdp;
-	struct gfs2_inode *ip;
-	struct gfs2_glock *io_gl = NULL;
-	int error;
-	struct gfs2_holder gh;
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_holder i_gh;
 	struct inode *inode;
+	int error;
 
-	inode = gfs2_iget_skip(sb, no_addr);
-
-	if (!inode)
-		return;
-
-	/* If it's not a new inode, someone's using it, so leave it alone. */
-	if (!(inode->i_state & I_NEW)) {
-		iput(inode);
-		return;
-	}
-
-	ip = GFS2_I(inode);
-	sdp = GFS2_SB(inode);
-	ip->i_no_formal_ino = -1;
+	error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+				  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return ERR_PTR(error);
 
-	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
-	if (unlikely(error))
+	error = gfs2_check_blk_type(sdp, no_addr, blktype);
+	if (error)
 		goto fail;
-	ip->i_gl->gl_object = ip;
 
-	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
-	if (unlikely(error))
-		goto fail_put;
-
-	set_bit(GIF_INVALID, &ip->i_flags);
-	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
-				   &ip->i_iopen_gh);
-	if (unlikely(error))
-		goto fail_iopen;
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
+	if (IS_ERR(inode))
+		goto fail;
 
-	ip->i_iopen_gh.gh_gl->gl_object = ip;
-	gfs2_glock_put(io_gl);
-	io_gl = NULL;
+	error = gfs2_inode_refresh(GFS2_I(inode));
+	if (error)
+		goto fail_iput;
 
-	inode->i_mode = DT2IF(DT_UNKNOWN);
+	/* Pick up the works we bypass in gfs2_inode_lookup */
+	if (inode->i_state & I_NEW) 
+		gfs2_set_iop(inode);
 
-	/*
-	 * We must read the inode in order to work out its type in
-	 * this case. Note that this doesn't happen often as we normally
-	 * know the type beforehand. This code path only occurs during
-	 * unlinked inode recovery (where it is safe to do this glock,
-	 * which is not true in the general case).
-	 */
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
-				   &gh);
-	if (unlikely(error))
-		goto fail_glock;
+	/* Two extra checks for NFS only */
+	if (no_formal_ino) {
+		error = -ESTALE;
+		if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
+			goto fail_iput;
 
-	/* Inode is now uptodate */
-	gfs2_glock_dq_uninit(&gh);
-	gfs2_set_iop(inode);
+		error = -EIO;
+		if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
+			goto fail_iput;
 
-	/* The iput will cause it to be deleted. */
-	iput(inode);
-	return;
+		error = 0;
+	}
 
-fail_glock:
-	gfs2_glock_dq(&ip->i_iopen_gh);
-fail_iopen:
-	if (io_gl)
-		gfs2_glock_put(io_gl);
-fail_put:
-	ip->i_gl->gl_object = NULL;
-	gfs2_glock_put(ip->i_gl);
 fail:
-	iget_failed(inode);
-	return;
+	gfs2_glock_dq_uninit(&i_gh);
+	return error ? ERR_PTR(error) : inode;
+fail_iput:
+	iput(inode);
+	goto fail;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc6..d8499fadcc53 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
 				       u64 no_addr, u64 no_formal_ino);
-extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+					 u64 *no_formal_ino,
+					 unsigned int blktype);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c1..33c8407b876f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  *          The inode, if one has been found, in inode.
  */
 
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
-			   u64 skip)
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
 {
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	unsigned int n;
+	struct gfs2_glock *gl;
+	struct gfs2_inode *ip;
+	int error;
+	int found = 0;
 
-	for(;;) {
-		if (goal >= rgd->rd_data)
-			break;
+	while (goal < rgd->rd_data) {
 		down_write(&sdp->sd_log_flush_lock);
 		n = 1;
 		block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
 		if (no_addr == skip)
 			continue;
 		*last_unlinked = no_addr;
-		return no_addr;
+
+		error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+		if (error)
+			continue;
+
+		/* If the inode is already in cache, we can ignore it here
+		 * because the existing inode disposal code will deal with
+		 * it when all refs have gone away. Accessing gl_object like
+		 * this is not safe in general. Here it is ok because we do
+		 * not dereference the pointer, and we only need an approx
+		 * answer to whether it is NULL or not.
+		 */
+		ip = gl->gl_object;
+
+		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+			gfs2_glock_put(gl);
+		else
+			found++;
+
+		/* Limit reclaim to sensible number of tasks */
+		if (found > 2*NR_CPUS)
+			return;
 	}
 
 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
-	return 0;
+	return;
 }
 
 /**
@@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
  *
  * Returns: errno
- *          unlinked: the block address of an unlinked block to be reclaimed
  */
 
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
-			  u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 	int loops = 0;
 	int error, rg_locked;
 
-	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			/* If the rg came in already locked, there's no
-			   way we can recover from a failed try_rgrp_unlink
-			   because that would require an iput which can only
-			   happen after the rgrp is unlocked. */
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							   ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							    ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK, unlinked;
+	u64 last_unlinked = NO_BLOCK;
+	int tries = 0;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
 
-try_again:
 	if (hold_rindex) {
 		/* We need to hold the rindex unless the inode we're using is
 		   the rindex itself, in which case it's already held. */
@@ -1218,31 +1227,23 @@ try_again:
 		else if (!sdp->sd_rgrps) /* We may not have the rindex read
 					    in, so: */
 			error = gfs2_ri_update_special(ip);
+		if (error)
+			return error;
 	}
 
-	if (error)
-		return error;
+	do {
+		error = get_local_rgrp(ip, &last_unlinked);
+		/* If there is no space, flushing the log may release some */
+		if (error)
+			gfs2_log_flush(sdp, NULL);
+	} while (error && tries++ < 3);
 
-	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
-	   dinodes along the way, error will equal -EAGAIN and unlinked will
-	   contains it block address. We then need to look up that inode and
-	   try to free it, and try the allocation again. */
-	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
 		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (error != -EAGAIN)
-			return error;
-
-		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-		/* regardless of whether or not gfs2_process_unlinked_inode
-		   was successful, we don't want to repeat it again. */
-		last_unlinked = unlinked;
-		gfs2_log_flush(sdp, NULL);
-		error = 0;
-
-		goto try_again;
+		return error;
 	}
+
 	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
-- 
cgit v1.2.3