From 06f9da6e826a0b459652b98a21541bca274bd440 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 12 Nov 2013 15:06:52 -0800
Subject: fs/ocfs2: remove unnecessary variable bits_wanted from
 ocfs2_calc_extend_credits

Code cleanup to remove unnecessary variable passed but never used
to ocfs2_calc_extend_credits.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c         |  3 +--
 fs/ocfs2/dir.c          |  4 ++--
 fs/ocfs2/file.c         |  3 +--
 fs/ocfs2/journal.h      |  3 +--
 fs/ocfs2/move_extents.c |  3 +--
 fs/ocfs2/refcounttree.c |  9 +++------
 fs/ocfs2/xattr.c        | 20 +++++++-------------
 7 files changed, 16 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37d3c0e2053..c0d4797618ef 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1802,8 +1802,7 @@ try_again:
 			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
 
 		credits = ocfs2_calc_extend_credits(inode->i_sb,
-						    &di->id2.i_list,
-						    clusters_to_alloc);
+						    &di->id2.i_list);
 
 	}
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 30544ce8e9f7..93b079115edc 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3284,7 +3284,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		if (ocfs2_dir_resv_allowed(osb))
 			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
 
-		credits = ocfs2_calc_extend_credits(sb, el, 1);
+		credits = ocfs2_calc_extend_credits(sb, el);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3716,7 +3716,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
 {
 	int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
 
-	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
 	credits += ocfs2_quota_trans_credits(osb->sb);
 	return credits;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d71903c6068b..67c037af301a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -603,8 +603,7 @@ restart_all:
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
-					    clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0b479bab3671..9ff4e8cf9d97 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -524,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
  * the result may be wrong.
  */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_extent_list *root_el,
-					    u32 bits_wanted)
+					    struct ocfs2_extent_list *root_el)
 {
 	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 3d3f3c83065c..445678d9163c 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -201,8 +201,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
 		}
 	}
 
-	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
-					      clusters_to_move + 2);
+	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
 
 	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
 	     extra_blocks, clusters_to_move, *credits);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bf4dfc14bb2c..0022192e7ac8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2502,8 +2502,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
 		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
 		*credits += ocfs2_calc_extend_credits(sb,
-						      et.et_root_el,
-						      ref_blocks);
+						      et.et_root_el);
 	} else {
 		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
 		*meta_add += 1;
@@ -2874,8 +2873,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
 		meta_add =
 			ocfs2_extend_meta_needed(et->et_root_el);
 
-	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
-					      num_clusters + 2);
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
 
 	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
 					       p_cluster, num_clusters,
@@ -3625,8 +3623,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
 
 		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
 		*credits += ocfs2_calc_extend_credits(inode->i_sb,
-						      et.et_root_el,
-						      ref_blocks);
+						      et.et_root_el);
 	}
 
 out:
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6ce0686eab72..aaaab8ba82ab 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -754,8 +754,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 			BUG_ON(why == RESTART_META);
 
 			credits = ocfs2_calc_extend_credits(inode->i_sb,
-							    &vb->vb_xv->xr_list,
-							    clusters_to_add);
+							    &vb->vb_xv->xr_list);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
 				status = -ENOMEM;
@@ -3040,8 +3039,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 			clusters_add += new_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							&def_xv.xv.xr_list,
-							new_clusters);
+							&def_xv.xv.xr_list);
 		}
 
 		goto meta_guess;
@@ -3106,8 +3104,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 			if (!ocfs2_xattr_is_local(xe))
 				credits += ocfs2_calc_extend_credits(
 							inode->i_sb,
-							&def_xv.xv.xr_list,
-							new_clusters);
+							&def_xv.xv.xr_list);
 			goto out;
 		}
 	}
@@ -3132,9 +3129,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
 			clusters_add += new_clusters - old_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							     &xv->xr_list,
-							     new_clusters -
-							     old_clusters);
+							     &xv->xr_list);
 			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
 				goto out;
 		}
@@ -3180,7 +3175,7 @@ meta_guess:
 				 &xb->xb_attrs.xb_root.xt_list;
 			meta_add += ocfs2_extend_meta_needed(el);
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							     el, 1);
+							     el);
 		} else
 			credits += OCFS2_SUBALLOC_ALLOC + 1;
 
@@ -6216,8 +6211,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
 			  le16_to_cpu(xv->xr_list.l_next_free_rec);
 
 		*credits += ocfs2_calc_extend_credits(sb,
-						&def_xv.xv.xr_list,
-						le32_to_cpu(xv->xr_clusters));
+						&def_xv.xv.xr_list);
 
 		/*
 		 * If the value is a tree with depth > 1, We don't go deep
@@ -6782,7 +6776,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
 		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
 
 	*credits += ocfs2_calc_extend_credits(osb->sb,
-					      xt_et->et_root_el, len);
+					      xt_et->et_root_el);
 
 	if (metas.num_metas) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
-- 
cgit v1.2.3


From f0cb0f0bca233935ac70707d47f5885419a7fd31 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 12 Nov 2013 15:06:53 -0800
Subject: fs/ocfs2/file.c: fix wrong comment

Unwritten extent only exists for file systems which support holes.  But
the comment said was opposite meaning and also the comment is not very
clear, so rephase it.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 67c037af301a..6fff128cad16 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -580,7 +580,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	int did_quota = 0;
 
 	/*
-	 * This function only exists for file systems which don't
+	 * Unwritten extent only exists for file systems which
 	 * support holes.
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
-- 
cgit v1.2.3


From 7391a294b861bf2c3b762dfdcf61b9c5f1bffa1f Mon Sep 17 00:00:00 2001
From: Rui Xiang <rui.xiang@huawei.com>
Date: Tue, 12 Nov 2013 15:06:54 -0800
Subject: ocfs2: return ENOMEM when sb_getblk() fails

The only reason for sb_getblk() failing is if it can't allocate the
buffer_head.  So return ENOMEM instead when it fails.

[joseph.qi@huawei.com: ocfs2_symlink_get_block() and ocfs2_read_blocks_sync() and ocfs2_read_blocks() need the same change]
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c          | 2 +-
 fs/ocfs2/aops.c           | 2 ++
 fs/ocfs2/buffer_head_io.c | 4 ++--
 fs/ocfs2/dir.c            | 8 ++++----
 fs/ocfs2/namei.c          | 2 +-
 fs/ocfs2/refcounttree.c   | 6 +++---
 fs/ocfs2/suballoc.c       | 4 ++--
 fs/ocfs2/super.c          | 4 ++--
 fs/ocfs2/xattr.c          | 2 +-
 9 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 17e6bdde96c5..dc7411fe185d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1025,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 		for(i = count;  i < (num_got + count); i++) {
 			bhs[i] = sb_getblk(osb->sb, first_blkno);
 			if (bhs[i] == NULL) {
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c0d4797618ef..c20360002f29 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -80,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
+		err = -ENOMEM;
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
 		     "%llu\n", (unsigned long long)iblock);
 		goto bail;
@@ -92,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 			    iblock;
 		buffer_cache_bh = sb_getblk(osb->sb, blkno);
 		if (!buffer_cache_bh) {
+			err = -ENOMEM;
 			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
 			goto bail;
 		}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5d18ad10c27f..5b704c63a103 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -115,7 +115,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		if (bhs[i] == NULL) {
 			bhs[i] = sb_getblk(osb->sb, block++);
 			if (bhs[i] == NULL) {
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
@@ -214,7 +214,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 			bhs[i] = sb_getblk(sb, block++);
 			if (bhs[i] == NULL) {
 				ocfs2_metadata_cache_io_unlock(ci);
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 93b079115edc..91a7e85ac8fd 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2349,7 +2349,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 
 	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
 	if (dx_root_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		goto out;
 	}
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
@@ -2422,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
 	for (i = 0; i < num_dx_leaves; i++) {
 		bh = sb_getblk(osb->sb, start_blk + i);
 		if (bh == NULL) {
-			ret = -EIO;
+			ret = -ENOMEM;
 			goto out;
 		}
 		dx_leaves[i] = bh;
@@ -2929,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 	dirdata_bh = sb_getblk(sb, blkno);
 	if (!dirdata_bh) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out_commit;
 	}
@@ -3159,7 +3159,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 
 	*new_bh = sb_getblk(sb, p_blkno);
 	if (!*new_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be3f8676a438..4f791f6d27d0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -489,7 +489,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 
 	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
 	if (!*new_fe_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto leave;
 	}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 0022192e7ac8..70d083e60d40 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1310,7 +1310,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 
 	new_bh = sb_getblk(sb, blkno);
 	if (new_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1561,7 +1561,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 
 	new_bh = sb_getblk(sb, blkno);
 	if (new_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
@@ -3029,7 +3029,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
 	for (i = 0; i < blocks; i++, old_block++, new_block++) {
 		new_bh = sb_getblk(osb->sb, new_block);
 		if (new_bh == NULL) {
-			ret = -EIO;
+			ret = -ENOMEM;
 			mlog_errno(ret);
 			break;
 		}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5397c07ce608..2c91452c4047 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -481,7 +481,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
@@ -661,7 +661,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d4e81e4a9b04..c41492957aa5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1848,8 +1848,8 @@ static int ocfs2_get_sector(struct super_block *sb,
 
 	*bh = sb_getblk(sb, block);
 	if (!*bh) {
-		mlog_errno(-EIO);
-		return -EIO;
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
 	}
 	lock_buffer(*bh);
 	if (!buffer_dirty(*bh))
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index aaaab8ba82ab..82257299d5ea 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -377,7 +377,7 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
 					      xb_blkno + i);
 		if (!bucket->bu_bhs[i]) {
-			rc = -EIO;
+			rc = -ENOMEM;
 			mlog_errno(rc);
 			break;
 		}
-- 
cgit v1.2.3


From 58796207cf09552706b0c3e0669f2ce52ea6cee1 Mon Sep 17 00:00:00 2001
From: Rui Xiang <rui.xiang@huawei.com>
Date: Tue, 12 Nov 2013 15:06:55 -0800
Subject: ocfs2: add necessary check in case sb_getblk() fails

sb_getblk() may return an err, so add a check for bh.

[joseph.qi@huawei.com: also add a check after calling sb_getblk() in ocfs2_create_xattr_block()]
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/refcounttree.c | 5 +++++
 fs/ocfs2/xattr.c        | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 70d083e60d40..55767e1ba724 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -612,6 +612,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	}
 
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
 	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
 
 	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 82257299d5ea..f0a1326d9bba 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2864,6 +2864,12 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	}
 
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto end;
+	}
+
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
 
 	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
-- 
cgit v1.2.3


From 910bffe0861c956096c31ad7e887cc23169955ed Mon Sep 17 00:00:00 2001
From: Joel Becker <jlbec@evilplan.org>
Date: Tue, 12 Nov 2013 15:06:57 -0800
Subject: ocfs2: don't spam on -EDQUOT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

-EDQUOT is a user-visible error, not a logic problem.  Teach mlog_errno()
to ignore it like it ignores -ENOSPC, etc.

Signed-off-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: Marek Królikowski <admin@wset.edu.pl>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/masklog.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index baa2b9ef7eef..2260fb9e6508 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_errno(st) do {						\
 	int _st = (st);							\
 	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
-	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\
+	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC &&		\
+	    _st != -EDQUOT)						\
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
-- 
cgit v1.2.3


From a8f70de37bbb991033208edff7758d997d68db37 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 12 Nov 2013 15:06:58 -0800
Subject: ocfs2: use bitmap_weight()

Use bitmap_weight() instead of reinventing the wheel.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 363f0dcc924f..292a13b05b6e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -35,6 +35,7 @@
 #include <linux/time.h>
 #include <linux/debugfs.h>
 #include <linux/slab.h>
+#include <linux/bitmap.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
-static int o2hb_pop_count(void *map, int count)
-{
-	int i = -1, pop = 0;
-
-	while ((i = find_next_bit(map, count, i + 1)) < count)
-		pop++;
-	return pop;
-}
-
 static void o2hb_write_timeout(struct work_struct *work)
 {
 	int failed, quorum;
@@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work)
 		spin_lock_irqsave(&o2hb_live_lock, flags);
 		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
 			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
-		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+		failed = bitmap_weight(o2hb_failed_region_bitmap,
 					O2NM_MAX_REGIONS);
-		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+		quorum = bitmap_weight(o2hb_quorum_region_bitmap,
 					O2NM_MAX_REGIONS);
 		spin_unlock_irqrestore(&o2hb_live_lock, flags);
 
@@ -765,7 +757,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
 	 * If global heartbeat active, unpin all regions if the
 	 * region count > CUT_OFF
 	 */
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
 		o2hb_region_unpin(NULL);
 unlock:
@@ -1829,7 +1821,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	live_threshold = O2HB_LIVE_THRESHOLD;
 	if (o2hb_global_heartbeat_active()) {
 		spin_lock(&o2hb_live_lock);
-		if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+		if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
 			live_threshold <<= 1;
 		spin_unlock(&o2hb_live_lock);
 	}
@@ -2180,7 +2172,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 	if (!o2hb_dependent_users)
 		goto unlock;
 
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
 		o2hb_region_pin(NULL);
 
@@ -2480,7 +2472,7 @@ static int o2hb_region_inc_user(const char *region_uuid)
 	if (o2hb_dependent_users > 1)
 		goto unlock;
 
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
 		ret = o2hb_region_pin(NULL);
 
-- 
cgit v1.2.3


From 750e3c6581304c7dff813ac2317f5d733f84cb23 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:06:59 -0800
Subject: ocfs2: skip locks in the blocked list

A parallel umount on 4 nodes triggered a bug in
dlm_process_recovery_date().  Here's the situation:

Receiving MIG_LOCKRES message, A node processes the locks in migratable
lockres.  It copys lvb from migratable lockres when processing the first
valid lock.

If there is a lock in the blocked list with the EX level, it triggers the
BUG.  Since valid lvbs are set when locks are granted with EX or PR
levels, locks in the blocked list cannot have valid lvbs.  Therefore I
think we should skip the locks in the blocked list.

Signed-off-by: Xuejiufei <xuejiufei@huawei.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0b5adca1b178..7035af09cc03 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1886,6 +1886,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		if (ml->type == LKM_NLMODE)
 			goto skip_lvb;
 
+		/*
+		 * If the lock is in the blocked list it can't have a valid lvb,
+		 * so skip it
+		 */
+		if (ml->list == DLM_BLOCKED_LIST)
+			goto skip_lvb;
+
 		if (!dlm_lvb_is_empty(mres->lvb)) {
 			if (lksb->flags & DLM_LKSB_PUT_LVB) {
 				/* other node was trying to update
-- 
cgit v1.2.3


From fae477b6f005d6bd8a6fb3f052b30d55cfa70c26 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:07:00 -0800
Subject: ocfs2: delay migration when the lockres is in migration state

We trigger a bug in __dlm_lockres_reserve_ast() when we parallel umount 4
nodes.  The situation is as follows:

1) Node A migrate all lockres it owned(eg.  lockres A) to other nodes
   say node B when it umounts.

2) Receiving MIG_LOCKRES message from A, Node B masters the lockres A
   with DLM_LOCK_RES_MIGRATING state set.

3) Then we umount ocfs2 on node B.  It also should migrate lockres A to
   another node, say node C.  But now, DLM_LOCK_RES_MIGRATING state of
   lockers A is not cleared.  Node B triggered the BUG on lockres with
   state DLM_LOCK_RES_MIGRATING.

Signed-off-by: Xuejiufei <xuejiufei@huawei.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Tariq Saeed <tariq.x.saeed@oracle.com>
Cc: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index cf0f103963b1..5a0ef78fe029 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2354,6 +2354,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 
 	assert_spin_locked(&res->spinlock);
 
+	/* delay migration when the lockres is in MIGRATING state */
+	if (res->state & DLM_LOCK_RES_MIGRATING)
+		return 0;
+
 	if (res->owner != dlm->node_num)
 		return 0;
 
-- 
cgit v1.2.3


From 518df6bdf38559247ee90b9e59a40fae779d57d0 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 12 Nov 2013 15:07:01 -0800
Subject: ocfs2: use find_last_bit()

We already have find_last_bit().  So just use it as described in the
comment.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 292a13b05b6e..73920ffda05b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -946,23 +946,9 @@ out:
 	return changed;
 }
 
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
-			     int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
 {
-	int highest, node;
-
-	highest = numbits;
-	node = -1;
-	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
-		if (node >= numbits)
-			break;
-
-		highest = node;
-	}
-
-	return highest;
+	return find_last_bit(nodes, numbits);
 }
 
 static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
-- 
cgit v1.2.3


From 728b98059a2f19393f906ba4febc0f2d1d15f280 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 12 Nov 2013 15:07:02 -0800
Subject: ocfs2: break useless while loop

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 5a0ef78fe029..af3f7aa73e13 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1885,8 +1885,10 @@ ok:
 			 * up nodes that this node contacted */
 			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
 						    nn+1)) < O2NM_MAX_NODES) {
-				if (nn != dlm->node_num && nn != assert->node_idx)
+				if (nn != dlm->node_num && nn != assert->node_idx) {
 					master_request = 1;
+					break;
+				}
 			}
 		}
 		mle->master = assert->node_idx;
-- 
cgit v1.2.3


From eedd40e1cad4217c9b7e2577debcdd0c48732cc3 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:03 -0800
Subject: ocfs2: rollback transaction in ocfs2_group_add()

If ocfs2_journal_access_di() fails, group->bg_next_group should rollback.
Otherwise, there would be a inconsistency between group_bh and main_bm_bh.

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/resize.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ec55add7604a..376750f7883e 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -469,6 +469,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_chain_rec *cr;
 	u16 cl_bpc;
+	u64 bg_ptr;
 
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
@@ -538,12 +539,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	}
 
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
+	bg_ptr = le64_to_cpu(group->bg_next_group);
 	group->bg_next_group = cr->c_blkno;
 	ocfs2_journal_dirty(handle, group_bh);
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
 				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
+		group->bg_next_group = cpu_to_le64(bg_ptr);
 		mlog_errno(ret);
 		goto out_commit;
 	}
-- 
cgit v1.2.3


From 8abaae8d85d4e1de0fcc81b733ba04a5d318ba7a Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:04 -0800
Subject: ocfs2: do not call brelse() if group_bh is not initialized in
 ocfs2_group_add()

If group_bh is not initialized, there is no need to release.  This
problem does not cause anything wrong, but the patch would make the code
more logical.

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/resize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 376750f7883e..822ebc10f281 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_unlock;
+		goto out_free_group_bh;
 	}
 
 	trace_ocfs2_group_add((unsigned long long)input->group,
@@ -524,7 +524,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	if (IS_ERR(handle)) {
 		mlog_errno(PTR_ERR(handle));
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out_free_group_bh;
 	}
 
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
@@ -577,8 +577,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
 	brelse(group_bh);
+
+out_unlock:
 	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
-- 
cgit v1.2.3


From bfbca926d67c9bf3212d1077c6afd48d2995a285 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:05 -0800
Subject: ocfs2: add missing errno in ocfs2_ioctl_move_extents()

If the file is not regular or writeable, it should return errno(EPERM).

This patch is based on 85a258b70d ("ocfs2: fix error handling in
ocfs2_ioctl_move_extents()").

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/move_extents.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 445678d9163c..631a98213474 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1066,8 +1066,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 	if (status)
 		return status;
 
-	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+		status = -EPERM;
 		goto out_drop;
+	}
 
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
 		status = -EPERM;
@@ -1089,8 +1091,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 		goto out_free;
 	}
 
-	if (range.me_start > i_size_read(inode))
+	if (range.me_start > i_size_read(inode)) {
+		status = -EINVAL;
 		goto out_free;
+	}
 
 	if (range.me_start + range.me_len > i_size_read(inode))
 			range.me_len = i_size_read(inode) - range.me_start;
-- 
cgit v1.2.3


From b1214e4757b7d5fcea483b927e130361d41430a5 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:07:06 -0800
Subject: ocfs2: fix possible double free in ocfs2_write_begin_nolock

When ocfs2_write_cluster_by_desc() failed in ocfs2_write_begin_nolock()
because of ENOSPC, it goes to out_quota, freeing data_ac(meta_ac).  Then
it calls ocfs2_try_to_free_truncate_log() to free space.  If enough
space freed, it will try to write again.  Unfortunately, some error
happenes before ocfs2_lock_allocators(), it goes to out and free
data_ac(meta_ac) again.

Signed-off-by: joyce <xuejiufei@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c20360002f29..f959a1532767 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1898,10 +1898,14 @@ out_commit:
 out:
 	ocfs2_free_write_ctxt(wc);
 
-	if (data_ac)
+	if (data_ac) {
 		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
+		data_ac = NULL;
+	}
+	if (meta_ac) {
 		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
 
 	if (ret == -ENOSPC && try_free) {
 		/*
-- 
cgit v1.2.3


From d00d2f8ab9f20f8e6a0fc3804847b418498d80b8 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 12 Nov 2013 15:07:07 -0800
Subject: ocfs2: convert use of typedef ctl_table to struct ctl_table

This typedef is unnecessary and should just be removed.

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/stackglue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 39abf89697ed..cb7ec0b63ddc 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -643,7 +643,7 @@ error:
 
 #define FS_OCFS2_NM		1
 
-static ctl_table ocfs2_nm_table[] = {
+static struct ctl_table ocfs2_nm_table[] = {
 	{
 		.procname	= "hb_ctl_path",
 		.data		= ocfs2_hb_ctl_path,
@@ -654,7 +654,7 @@ static ctl_table ocfs2_nm_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_mod_table[] = {
+static struct ctl_table ocfs2_mod_table[] = {
 	{
 		.procname	= "nm",
 		.data		= NULL,
@@ -665,7 +665,7 @@ static ctl_table ocfs2_mod_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_kern_table[] = {
+static struct ctl_table ocfs2_kern_table[] = {
 	{
 		.procname	= "ocfs2",
 		.data		= NULL,
@@ -676,7 +676,7 @@ static ctl_table ocfs2_kern_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_root_table[] = {
+static struct ctl_table ocfs2_root_table[] = {
 	{
 		.procname	= "fs",
 		.data		= NULL,
-- 
cgit v1.2.3


From 41ecc345984bcc8bf341a3e758c1eb3fc543dd83 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Nov 2013 15:07:08 -0800
Subject: ocfs2: simplify ocfs2_invalidatepage() and ocfs2_releasepage()

Ocfs2 doesn't do data journalling.  Thus its ->invalidatepage and
->releasepage functions never get called on buffers that have journal
heads attached.  So just use standard variants of functions from
buffer.c.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f959a1532767..aeb44e879c51 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,26 +594,11 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	ocfs2_rw_unlock(inode, level);
 }
 
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3.  PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
-				 unsigned int length)
-{
-	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
-	jbd2_journal_invalidatepage(journal, page, offset, length);
-}
-
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
-	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
 	if (!page_has_buffers(page))
 		return 0;
-	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	return try_to_free_buffers(page);
 }
 
 static ssize_t ocfs2_direct_IO(int rw,
@@ -2092,7 +2077,7 @@ const struct address_space_operations ocfs2_aops = {
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
 	.direct_IO		= ocfs2_direct_IO,
-	.invalidatepage		= ocfs2_invalidatepage,
+	.invalidatepage		= block_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
-- 
cgit v1.2.3


From 83285c72e08c42848808039ef2d3b67a1bb88832 Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Tue, 12 Nov 2013 15:07:19 -0800
Subject: mm: use pgdat_end_pfn() to simplify the code in others

Use "pgdat_end_pfn()" instead of "pgdat->node_start_pfn +
pgdat->node_spanned_pages".  Simplify the code, no functional change.

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c     | 3 +--
 mm/bootmem.c        | 2 +-
 mm/memory_hotplug.c | 9 ++++-----
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 06ea155e1a59..5ed0e52d6aa0 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -255,8 +255,7 @@ static int kcore_update_ram(void)
 	end_pfn = 0;
 	for_each_node_state(nid, N_MEMORY) {
 		unsigned long node_end;
-		node_end  = NODE_DATA(nid)->node_start_pfn +
-			NODE_DATA(nid)->node_spanned_pages;
+		node_end = node_end_pfn(nid);
 		if (end_pfn < node_end)
 			end_pfn = node_end;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..95b528cd4de7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
 	/* update goal according ...MAX_DMA32_PFN */
-	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
 	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..375a42d76b2c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -365,8 +365,7 @@ out_fail:
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			    unsigned long end_pfn)
 {
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
 		pgdat->node_start_pfn = start_pfn;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 static void shrink_pgdat_span(struct pglist_data *pgdat,
 			      unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
-	unsigned long pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+	unsigned long pgdat_end_pfn = p;
 	unsigned long pfn;
 	struct mem_section *ms;
 	int nid = pgdat->node_id;
-- 
cgit v1.2.3


From 948927ee9e4f35f287e61a79c9f0e85ca2202c7d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 12 Nov 2013 15:07:28 -0800
Subject: mm, mempolicy: make mpol_to_str robust and always succeed

mpol_to_str() should not fail.  Currently, it either fails because the
string buffer is too small or because a string hasn't been defined for a
mempolicy mode.

If a new mempolicy mode is introduced and no string is defined for it,
just warn and return "unknown".

If the buffer is too small, just truncate the string and return, the
same behavior as snprintf().

This also fixes a bug where there was no NULL-byte termination when doing
*p++ = '=' and *p++ ':' and maxlen has been reached.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Chen Gang <gang.chen@asianux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c        | 14 ++++++-------
 include/linux/mempolicy.h |  5 ++---
 mm/mempolicy.c            | 52 +++++++++++++++--------------------------------
 3 files changed, 24 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 390bdab01c3c..9f1369fe0afb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1387,8 +1387,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {};
 	struct mempolicy *pol;
-	int n;
-	char buffer[50];
+	char buffer[64];
+	int nid;
 
 	if (!mm)
 		return 0;
@@ -1404,10 +1404,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.mm = mm;
 
 	pol = get_vma_policy(task, vma, vma->vm_start);
-	n = mpol_to_str(buffer, sizeof(buffer), pol);
+	mpol_to_str(buffer, sizeof(buffer), pol);
 	mpol_cond_put(pol);
-	if (n < 0)
-		return n;
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1460,9 +1458,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);
 
-	for_each_node_state(n, N_MEMORY)
-		if (md->node[n])
-			seq_printf(m, " N%d=%lu", n, md->node[n]);
+	for_each_node_state(nid, N_MEMORY)
+		if (md->node[nid])
+			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
 out:
 	seq_putc(m, '\n');
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ea4d2495c646..9fe426b30a41 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -169,7 +169,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 extern int mpol_parse_str(char *str, struct mempolicy **mpol);
 #endif
 
-extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 
 /* Check if a vma is migratable */
 static inline int vma_migratable(struct vm_area_struct *vma)
@@ -307,9 +307,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 }
 #endif
 
-static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
-	return 0;
 }
 
 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..260b8213a873 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2914,62 +2914,45 @@ out:
  * @maxlen:  length of @buffer
  * @pol:  pointer to mempolicy to be formatted
  *
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string.  If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
  */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
-	int l;
-	nodemask_t nodes;
-	unsigned short mode;
-	unsigned short flags = pol ? pol->flags : 0;
-
-	/*
-	 * Sanity check:  room for longest mode, flag and some nodes
-	 */
-	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+	nodemask_t nodes = NODE_MASK_NONE;
+	unsigned short mode = MPOL_DEFAULT;
+	unsigned short flags = 0;
 
-	if (!pol || pol == &default_policy)
-		mode = MPOL_DEFAULT;
-	else
+	if (pol && pol != &default_policy) {
 		mode = pol->mode;
+		flags = pol->flags;
+	}
 
 	switch (mode) {
 	case MPOL_DEFAULT:
-		nodes_clear(nodes);
 		break;
-
 	case MPOL_PREFERRED:
-		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
 			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
-
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
-
 	default:
-		return -EINVAL;
+		WARN_ON_ONCE(1);
+		snprintf(p, maxlen, "unknown");
+		return;
 	}
 
-	l = strlen(policy_modes[mode]);
-	if (buffer + maxlen < p + l + 1)
-		return -ENOSPC;
-
-	strcpy(p, policy_modes[mode]);
-	p += l;
+	p += snprintf(p, maxlen, policy_modes[mode]);
 
 	if (flags & MPOL_MODE_FLAGS) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = '=';
+		p += snprintf(p, buffer + maxlen - p, "=");
 
 		/*
 		 * Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	}
 
 	if (!nodes_empty(nodes)) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = ':';
+		p += snprintf(p, buffer + maxlen - p, ":");
 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
 	}
-	return p - buffer;
 }
-- 
cgit v1.2.3


From ec8e41aec13005fed0dbee002fb8c99b4e001d50 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 12 Nov 2013 15:07:49 -0800
Subject: /proc/pid/smaps: show VM_SOFTDIRTY flag in VmFlags line

This flag shows that the VMA is "newly created" and thus represents
"dirty" in the task's VM.

You can clear it by "echo 4 > /proc/pid/clear_refs."

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 1 +
 fs/proc/task_mmu.c                 | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 823c95faebd2..22d89aa37218 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -460,6 +460,7 @@ manner. The codes are the following:
     nl  - non-linear mapping
     ar  - architecture specific flag
     dd  - do not include area into core dump
+    sd  - soft-dirty flag
     mm  - mixed map area
     hg  - huge page advise flag
     nh  - no-huge page advise flag
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9f1369fe0afb..abbe825d20ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -561,6 +561,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_NONLINEAR)]	= "nl",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_DONTDUMP)]	= "dd",
+#ifdef CONFIG_MEM_SOFT_DIRTY
+		[ilog2(VM_SOFTDIRTY)]	= "sd",
+#endif
 		[ilog2(VM_MIXEDMAP)]	= "mm",
 		[ilog2(VM_HUGEPAGE)]	= "hg",
 		[ilog2(VM_NOHUGEPAGE)]	= "nh",
-- 
cgit v1.2.3


From c4a391b53a72d2df4ee97f96f78c1d5971b47489 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Nov 2013 15:07:51 -0800
Subject: writeback: do not sync data dirtied after sync start

When there are processes heavily creating small files while sync(2) is
running, it can easily happen that quite some new files are created
between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2).  That can happen
especially if there are several busy filesystems (remember that sync
traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
fs before starting it on another fs).  Because WB_SYNC_ALL pass is slow
(e.g.  causes a transaction commit and cache flush for each inode in
ext3), resulting sync(2) times are rather large.

The following script reproduces the problem:

  function run_writers
  {
    for (( i = 0; i < 10; i++ )); do
      mkdir $1/dir$i
      for (( j = 0; j < 40000; j++ )); do
        dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
      done &
    done
  }

  for dir in "$@"; do
    run_writers $dir
  done

  sleep 40
  time sync

Fix the problem by disregarding inodes dirtied after sync(2) was called
in the WB_SYNC_ALL pass.  To allow for this, sync_inodes_sb() now takes
a time stamp when sync has started which is used for setting up work for
flusher threads.

To give some numbers, when above script is run on two ext4 filesystems
on simple SATA drive, the average sync time from 10 runs is 267.549
seconds with standard deviation 104.799426.  With the patched kernel,
the average sync time from 10 runs is 2.995 seconds with standard
deviation 0.096.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                | 33 ++++++++++++++++++++++-----------
 fs/sync.c                        | 15 +++++++++------
 fs/xfs/xfs_super.c               |  2 +-
 include/linux/writeback.h        |  2 +-
 include/trace/events/writeback.h |  6 +++---
 5 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9f4935b8f208..4afdbd6d9678 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -39,13 +39,18 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
-	unsigned long *older_than_this;
+	/*
+	 * Write only inodes dirtied before this time. Don't forget to set
+	 * older_than_this_is_set when you set this.
+	 */
+	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -246,10 +251,10 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	WARN_ON_ONCE(!work->older_than_this_is_set);
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
-		if (work->older_than_this &&
-		    inode_dirtied_after(inode, *work->older_than_this))
+		if (inode_dirtied_after(inode, work->older_than_this))
 			break;
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
@@ -733,6 +738,8 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -791,12 +798,13 @@ static long wb_writeback(struct bdi_writeback *wb,
 {
 	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
-	unsigned long oldest_jif;
 	struct inode *inode;
 	long progress;
 
-	oldest_jif = jiffies;
-	work->older_than_this = &oldest_jif;
+	if (!work->older_than_this_is_set) {
+		work->older_than_this = jiffies;
+		work->older_than_this_is_set = 1;
+	}
 
 	spin_lock(&wb->list_lock);
 	for (;;) {
@@ -830,10 +838,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * safe.
 		 */
 		if (work->for_kupdate) {
-			oldest_jif = jiffies -
+			work->older_than_this = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
 		} else if (work->for_background)
-			oldest_jif = jiffies;
+			work->older_than_this = jiffies;
 
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
@@ -1345,18 +1353,21 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
 
 /**
  * sync_inodes_sb	-	sync sb inode pages
- * @sb: the superblock
+ * @sb:			the superblock
+ * @older_than_this:	timestamp
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block.
+ * superblock that has been dirtied before given timestamp.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
+		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
diff --git a/fs/sync.c b/fs/sync.c
index 905f3f6b3d85..ff96f99fef64 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,10 +27,11 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait,
+			     unsigned long start)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, start);
 	else
 		writeback_inodes_sb(sb, WB_REASON_SYNC);
 
@@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 int sync_filesystem(struct super_block *sb)
 {
 	int ret;
+	unsigned long start = jiffies;
 
 	/*
 	 * We need to be protected against the filesystem going from
@@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, 0, start);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, 1, start);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
 	if (!(sb->s_flags & MS_RDONLY))
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, *((unsigned long *)arg));
 }
 
 static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 SYSCALL_DEFINE0(sync)
 {
 	int nowait = 0, wait = 1;
+	unsigned long start = jiffies;
 
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &start);
 	iterate_supers(sync_fs_one_sb, &nowait);
 	iterate_supers(sync_fs_one_sb, &wait);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc99449..8968f5036fa1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,7 +918,7 @@ xfs_flush_inodes(
 	struct super_block	*sb = mp->m_super;
 
 	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, jiffies);
 		up_read(&sb->s_umount);
 	}
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 021b8a319b9e..fc0e4320aa6d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
-void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 464ea82e10db..c7bbbe794e65 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -287,11 +287,11 @@ TRACE_EVENT(writeback_queue_io,
 		__field(int,		reason)
 	),
 	TP_fast_assign(
-		unsigned long *older_than_this = work->older_than_this;
+		unsigned long older_than_this = work->older_than_this;
 		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
-		__entry->older	= older_than_this ?  *older_than_this : 0;
+		__entry->older	= older_than_this;
 		__entry->age	= older_than_this ?
-				  (jiffies - *older_than_this) * 1000 / HZ : -1;
+				  (jiffies - older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
 		__entry->reason	= work->reason;
 	),
-- 
cgit v1.2.3


From 00619bcc44d6b779aa366130b354153c222e4380 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 12 Nov 2013 15:08:31 -0800
Subject: mm: factor commit limit calculation

The same calculation is currently done in three differents places.
Factor that code so future changes has to be made at only one place.

[akpm@linux-foundation.org: uninline vm_commit_limit()]
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c    |  5 +----
 include/linux/mman.h |  2 ++
 mm/mmap.c            |  4 +---
 mm/nommu.c           |  3 +--
 mm/util.c            | 13 +++++++++++++
 5 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 59d85d608898..c805d5b69ba1 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -24,7 +24,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 {
 	struct sysinfo i;
 	unsigned long committed;
-	unsigned long allowed;
 	struct vmalloc_info vmi;
 	long cached;
 	unsigned long pages[NR_LRU_LISTS];
@@ -37,8 +36,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	si_meminfo(&i);
 	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
-	allowed = ((totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
 			total_swapcache_pages() - i.bufferram;
@@ -147,7 +144,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(global_page_state(NR_BOUNCE)),
 		K(global_page_state(NR_WRITEBACK_TEMP)),
-		K(allowed),
+		K(vm_commit_limit()),
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 92dc257251e4..7f7f8dae4b1d 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -87,4 +87,6 @@ calc_vm_flag_bits(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
+
+unsigned long vm_commit_limit(void);
 #endif /* _LINUX_MMAN_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 3d3e224be771..803048e9c568 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = (totalram_pages - hugetlb_total_pages())
-	       	* sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..d8a957bb9e31 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some 3% for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
 	return mapping;
 }
 
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ */
+unsigned long vm_commit_limit(void)
+{
+	return ((totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+}
+
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-- 
cgit v1.2.3


From 54886a7153353ea2bf21ebfc1b8e030e71d151d7 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Tue, 12 Nov 2013 15:08:35 -0800
Subject: cramfs: mark as obsolete

Who needs cramfs when you have squashfs?  At least, we should warn people
that cramfs is obsolete.

Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS       | 2 +-
 fs/cramfs/Kconfig | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/MAINTAINERS b/MAINTAINERS
index 051e4dc5b70f..f5867f469f13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2373,7 +2373,7 @@ F:	kernel/cpuset.c
 
 CRAMFS FILESYSTEM
 W:	http://sourceforge.net/projects/cramfs/
-S:	Orphan
+S:	Orphan / Obsolete
 F:	Documentation/filesystems/cramfs.txt
 F:	fs/cramfs/
 
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index cd06466f365e..11b29d491b7c 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,5 +1,5 @@
 config CRAMFS
-	tristate "Compressed ROM file system support (cramfs)"
+	tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
 	depends on BLOCK
 	select ZLIB_INFLATE
 	help
@@ -16,4 +16,7 @@ config CRAMFS
 	  cramfs.  Note that the root file system (the one containing the
 	  directory /) cannot be compiled as a module.
 
+	  This filesystem is obsoleted by SquashFS, which is much better
+	  in terms of performance and features.
+
 	  If unsure, say N.
-- 
cgit v1.2.3


From 6bc080d8fdae33f4463203a400cfaa01e91701e2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 12 Nov 2013 15:10:04 -0800
Subject: debugfs: use list_next_entry() in debugfs_remove_recursive()

Change debugfs_remove_recursive() to use list_next_entry(child), no
changes in generated code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eilon Greenstein <eilong@broadcom.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/debugfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c7c83ff0f752..9c0444cccbe1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -566,8 +566,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	mutex_lock(&parent->d_inode->i_mutex);
 
 	if (child != dentry) {
-		next = list_entry(child->d_u.d_child.next, struct dentry,
-					d_u.d_child);
+		next = list_next_entry(child, d_u.d_child);
 		goto up;
 	}
 
-- 
cgit v1.2.3


From ae10b2b4eb01bedc91d29d5c5bb9e416fd806c40 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Tue, 12 Nov 2013 15:10:16 -0800
Subject: epoll: optimize EPOLL_CTL_DEL using rcu

Nathan Zimmer found that once we get over 10+ cpus, the scalability of
SPECjbb falls over due to the contention on the global 'epmutex', which is
taken in on EPOLL_CTL_ADD and EPOLL_CTL_DEL operations.

Patch #1 removes the 'epmutex' lock completely from the EPOLL_CTL_DEL path
by using rcu to guard against any concurrent traversals.

Patch #2 remove the 'epmutex' lock from EPOLL_CTL_ADD operations for
simple topologies.  IE when adding a link from an epoll file descriptor to
a wakeup source, where the epoll file descriptor is not nested.

This patch (of 2):

Optimize EPOLL_CTL_DEL such that it does not require the 'epmutex' by
converting the file->f_ep_links list into an rcu one.  In this way, we can
traverse the epoll network on the add path in parallel with deletes.
Since deletes can't create loops or worse wakeup paths, this is safe.

This patch in combination with the patch "epoll: Do not take global 'epmutex'
for simple topologies", shows a dramatic performance improvement in
scalability for SPECjbb.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
CC: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 56 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 810c28fb8c3c..584249454822 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,6 +41,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/compat.h>
+#include <linux/rculist.h>
 
 /*
  * LOCKING:
@@ -133,8 +134,12 @@ struct nested_calls {
  * of these on a server and we do not want this to take another cache line.
  */
 struct epitem {
-	/* RB tree node used to link this structure to the eventpoll RB tree */
-	struct rb_node rbn;
+	union {
+		/* RB tree node links this structure to the eventpoll RB tree */
+		struct rb_node rbn;
+		/* Used to free the struct epitem */
+		struct rcu_head rcu;
+	};
 
 	/* List header used to link this structure to the eventpoll ready list */
 	struct list_head rdllink;
@@ -671,6 +676,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	return error;
 }
 
+static void epi_rcu_free(struct rcu_head *head)
+{
+	struct epitem *epi = container_of(head, struct epitem, rcu);
+	kmem_cache_free(epi_cache, epi);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -692,8 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
-	if (ep_is_linked(&epi->fllink))
-		list_del_init(&epi->fllink);
+	list_del_rcu(&epi->fllink);
 	spin_unlock(&file->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
@@ -704,9 +714,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	spin_unlock_irqrestore(&ep->lock, flags);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
-
-	/* At this point it is safe to free the eventpoll item */
-	kmem_cache_free(epi_cache, epi);
+	/*
+	 * At this point it is safe to free the eventpoll item. Use the union
+	 * field epi->rcu, since we are trying to minimize the size of
+	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+	 * use of the rbn field.
+	 */
+	call_rcu(&epi->rcu, epi_rcu_free);
 
 	atomic_long_dec(&ep->user->epoll_watches);
 
@@ -872,7 +887,6 @@ static const struct file_operations eventpoll_fops = {
  */
 void eventpoll_release_file(struct file *file)
 {
-	struct list_head *lsthead = &file->f_ep_links;
 	struct eventpoll *ep;
 	struct epitem *epi;
 
@@ -890,17 +904,12 @@ void eventpoll_release_file(struct file *file)
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
-
-	while (!list_empty(lsthead)) {
-		epi = list_first_entry(lsthead, struct epitem, fllink);
-
+	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
 		ep = epi->ep;
-		list_del_init(&epi->fllink);
 		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
 	}
-
 	mutex_unlock(&epmutex);
 }
 
@@ -1138,7 +1147,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
 	struct file *child_file;
 	struct epitem *epi;
 
-	list_for_each_entry(epi, &file->f_ep_links, fllink) {
+	/* CTL_DEL can remove links here, but that can't increase our count */
+	rcu_read_lock();
+	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
 		child_file = epi->ep->file;
 		if (is_file_epoll(child_file)) {
 			if (list_empty(&child_file->f_ep_links)) {
@@ -1160,6 +1171,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
 				"file is not an ep!\n");
 		}
 	}
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1286,7 +1298,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* Add the current item to the list of active epoll hook for this file */
 	spin_lock(&tfile->f_lock);
-	list_add_tail(&epi->fllink, &tfile->f_ep_links);
+	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
 	spin_unlock(&tfile->f_lock);
 
 	/*
@@ -1327,8 +1339,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 error_remove_epi:
 	spin_lock(&tfile->f_lock);
-	if (ep_is_linked(&epi->fllink))
-		list_del_init(&epi->fllink);
+	list_del_rcu(&epi->fllink);
 	spin_unlock(&tfile->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
@@ -1844,15 +1855,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * and hang them on the tfile_check_list, so we can check that we
 	 * haven't created too many possible wakeup paths.
 	 *
-	 * We need to hold the epmutex across both ep_insert and ep_remove
-	 * b/c we want to make sure we are looking at a coherent view of
-	 * epoll network.
+	 * We need to hold the epmutex across ep_insert to prevent
+	 * multple adds from creating loops in parallel.
 	 */
-	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
+	if (op == EPOLL_CTL_ADD) {
 		mutex_lock(&epmutex);
 		did_lock_epmutex = 1;
-	}
-	if (op == EPOLL_CTL_ADD) {
 		if (is_file_epoll(tf.file)) {
 			error = -ELOOP;
 			if (ep_loop_check(ep, tf.file) != 0) {
-- 
cgit v1.2.3


From 67347fe4e6326338ee217d7eb826bedf30b2e155 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Tue, 12 Nov 2013 15:10:18 -0800
Subject: epoll: do not take global 'epmutex' for simple topologies

When calling EPOLL_CTL_ADD for an epoll file descriptor that is attached
directly to a wakeup source, we do not need to take the global 'epmutex',
unless the epoll file descriptor is nested.  The purpose of taking the
'epmutex' on add is to prevent complex topologies such as loops and deep
wakeup paths from forming in parallel through multiple EPOLL_CTL_ADD
operations.  However, for the simple case of an epoll file descriptor
attached directly to a wakeup source (with no nesting), we do not need to
hold the 'epmutex'.

This patch along with 'epoll: optimize EPOLL_CTL_DEL using rcu' improves
scalability on larger systems.  Quoting Nathan Zimmer's mail on SPECjbb
performance:

"On the 16 socket run the performance went from 35k jOPS to 125k jOPS.  In
addition the benchmark when from scaling well on 10 sockets to scaling
well on just over 40 sockets.

...

Currently the benchmark stops scaling at around 40-44 sockets but it seems like
I found a second unrelated bottleneck."

[akpm@linux-foundation.org: use `bool' for boolean variables, remove unneeded/undesirable cast of void*, add missed ep_scan_ready_list() kerneldoc]
Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 584249454822..f7fe7e3ce664 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -585,14 +585,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
  * @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
 			      int (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
-			      void *priv,
-			      int depth)
+			      void *priv, int depth, bool ep_locked)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -603,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock_nested(&ep->mtx, depth);
+
+	if (!ep_locked)
+		mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -667,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	}
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	mutex_unlock(&ep->mtx);
+	if (!ep_locked)
+		mutex_unlock(&ep->mtx);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -822,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+struct readyevents_arg {
+	struct eventpoll *ep;
+	bool locked;
+};
+
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
+	struct readyevents_arg *arg = priv;
+
+	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+				  call_nests + 1, arg->locked);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
 	int pollflags;
 	struct eventpoll *ep = file->private_data;
+	struct readyevents_arg arg;
+
+	/*
+	 * During ep_insert() we already hold the ep->mtx for the tfile.
+	 * Prevent re-aquisition.
+	 */
+	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+	arg.ep = ep;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
@@ -842,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep, current);
+				   ep_poll_readyevents_proc, &arg, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
@@ -1243,7 +1265,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
  * Must be called with "mtx" held.
  */
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd)
+		     struct file *tfile, int fd, int full_check)
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
@@ -1309,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* now check if we've created too many backpaths */
 	error = -EINVAL;
-	if (reverse_path_check())
+	if (full_check && reverse_path_check())
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
@@ -1532,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1802,11 +1824,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
-	int did_lock_epmutex = 0;
+	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
 	struct epoll_event epds;
+	struct eventpoll *tep = NULL;
 
 	error = -EFAULT;
 	if (ep_op_has_event(op) &&
@@ -1855,23 +1878,40 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * and hang them on the tfile_check_list, so we can check that we
 	 * haven't created too many possible wakeup paths.
 	 *
-	 * We need to hold the epmutex across ep_insert to prevent
-	 * multple adds from creating loops in parallel.
+	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+	 * the epoll file descriptor is attaching directly to a wakeup source,
+	 * unless the epoll file descriptor is nested. The purpose of taking the
+	 * 'epmutex' on add is to prevent complex toplogies such as loops and
+	 * deep wakeup paths from forming in parallel through multiple
+	 * EPOLL_CTL_ADD operations.
 	 */
+	mutex_lock_nested(&ep->mtx, 0);
 	if (op == EPOLL_CTL_ADD) {
-		mutex_lock(&epmutex);
-		did_lock_epmutex = 1;
-		if (is_file_epoll(tf.file)) {
-			error = -ELOOP;
-			if (ep_loop_check(ep, tf.file) != 0) {
-				clear_tfile_check_list();
-				goto error_tgt_fput;
+		if (!list_empty(&f.file->f_ep_links) ||
+						is_file_epoll(tf.file)) {
+			full_check = 1;
+			mutex_unlock(&ep->mtx);
+			mutex_lock(&epmutex);
+			if (is_file_epoll(tf.file)) {
+				error = -ELOOP;
+				if (ep_loop_check(ep, tf.file) != 0) {
+					clear_tfile_check_list();
+					goto error_tgt_fput;
+				}
+			} else
+				list_add(&tf.file->f_tfile_llink,
+							&tfile_check_list);
+			mutex_lock_nested(&ep->mtx, 0);
+			if (is_file_epoll(tf.file)) {
+				tep = tf.file->private_data;
+				mutex_lock_nested(&tep->mtx, 1);
 			}
-		} else
-			list_add(&tf.file->f_tfile_llink, &tfile_check_list);
+		}
+	}
+	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
+		tep = tf.file->private_data;
+		mutex_lock_nested(&tep->mtx, 1);
 	}
-
-	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
@@ -1885,10 +1925,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd);
+			error = ep_insert(ep, &epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
-		clear_tfile_check_list();
+		if (full_check)
+			clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
 		if (epi)
@@ -1904,10 +1945,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			error = -ENOENT;
 		break;
 	}
+	if (tep != NULL)
+		mutex_unlock(&tep->mtx);
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-	if (did_lock_epmutex)
+	if (full_check)
 		mutex_unlock(&epmutex);
 
 	fdput(tf);
-- 
cgit v1.2.3


From 74a797d99a9fa4871ca978ac65e0f3a90dcf84d8 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Tue, 12 Nov 2013 15:11:06 -0800
Subject: fs/hfs/btree.h: remove duplicate defines

This patch removes duplicate defines from fs/hfs/btree.h

[akpm@linux-foundation.org: retain the comments]
Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Reviewed-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfs/btree.h           | 5 -----
 fs/hfsplus/hfsplus_raw.h | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index 2a1d712f85dc..f6bd266d70b5 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -153,11 +153,6 @@ struct hfs_btree_header_rec {
 	u32 reserved3[16];
 } __packed;
 
-#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
-#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
-#define HFS_NODE_MAP		0x02	/* Holds part of the bitmap of used nodes */
-#define HFS_NODE_LEAF		0xFF	/* A leaf (ndNHeight==1) node */
-
 #define BTREE_ATTR_BADCLOSE	0x00000001	/* b-tree not closed properly. not
 						   used by hfsplus. */
 #define HFS_TREE_BIGKEYS	0x00000002	/* key length is u16 instead of u8.
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 452ede01b036..2c54dd81def4 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -156,10 +156,10 @@ struct hfs_bnode_desc {
 } __packed;
 
 /* HFS+ BTree node types */
-#define HFS_NODE_INDEX	0x00
-#define HFS_NODE_HEADER	0x01
-#define HFS_NODE_MAP	0x02
-#define HFS_NODE_LEAF	0xFF
+#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
+#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
+#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
 
 /* HFS+ BTree header */
 struct hfs_btree_header_rec {
-- 
cgit v1.2.3


From b3b5b0f03cc4ea074ed0ac110d0afd17e0ccdf9e Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:07 -0800
Subject: hfsplus: add metadata file's clump size calculation functionality

There are situation when HFS+ volume had been created without
AttributesFile.  Such situation can take place because of using old
mkfs.hfs utility or creation HFS+ volume without taking in mind
necessity to use xattrs.  For example, Mac OS X 10.4 (Tiger) doesn't
create AttributesFile during mkfs phase.  Also it is a very frequent
situation for the case of users that created HFS+ volumes under Linux.
As a result, xattrs and POSIX ACLs on HFS+ volume are unavailable for
such users.

This patchset implements functionality of AttributesFile creation on
HFS+ volume in the case of this metadata file absence during operation
of xattr creation.

This patch:

Add functionality of metadata file's clump size calculation.  Operation
of AttributesFile creation needs in clump size setting.  This value will
be used when AttributesFile will be extended.

This code is adopted from code of newfs_hfs utility of diskdev_cmds packet
http://opensource.apple.com/tarballs/diskdev_cmds/.

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/btree.c      | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/hfsplus/hfsplus_fs.h |   1 +
 2 files changed, 113 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 0c6540c91167..0fcec8b2a90b 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -15,6 +15,118 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 
+/*
+ * Initial source code of clump size calculation is gotten
+ * from http://opensource.apple.com/tarballs/diskdev_cmds/
+ */
+#define CLUMP_ENTRIES	15
+
+static short clumptbl[CLUMP_ENTRIES * 3] = {
+/*
+ *	    Volume	Attributes	 Catalog	 Extents
+ *	     Size	Clump (MB)	Clump (MB)	Clump (MB)
+ */
+	/*   1GB */	  4,		  4,		 4,
+	/*   2GB */	  6,		  6,		 4,
+	/*   4GB */	  8,		  8,		 4,
+	/*   8GB */	 11,		 11,		 5,
+	/*
+	 * For volumes 16GB and larger, we want to make sure that a full OS
+	 * install won't require fragmentation of the Catalog or Attributes
+	 * B-trees.  We do this by making the clump sizes sufficiently large,
+	 * and by leaving a gap after the B-trees for them to grow into.
+	 *
+	 * For SnowLeopard 10A298, a FullNetInstall with all packages selected
+	 * results in:
+	 * Catalog B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       31616
+	 *	freeNodes:         1978
+	 * (used = 231.55 MB)
+	 * Attributes B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       63232
+	 *	freeNodes:          958
+	 * (used = 486.52 MB)
+	 *
+	 * We also want Time Machine backup volumes to have a sufficiently
+	 * large clump size to reduce fragmentation.
+	 *
+	 * The series of numbers for Catalog and Attribute form a geometric
+	 * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times
+	 * the previous term.  For Attributes (16GB to 512GB), each term is
+	 * 4**(1/5) times the previous term.  For 1TB to 16TB, each term is
+	 * 2**(1/5) times the previous term.
+	 */
+	/*  16GB */	 64,		 32,		 5,
+	/*  32GB */	 84,		 49,		 6,
+	/*  64GB */	111,		 74,		 7,
+	/* 128GB */	147,		111,		 8,
+	/* 256GB */	194,		169,		 9,
+	/* 512GB */	256,		256,		11,
+	/*   1TB */	294,		294,		14,
+	/*   2TB */	338,		338,		16,
+	/*   4TB */	388,		388,		20,
+	/*   8TB */	446,		446,		25,
+	/*  16TB */	512,		512,		32
+};
+
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size,
+					u64 sectors, int file_id)
+{
+	u32 mod = max(node_size, block_size);
+	u32 clump_size;
+	int column;
+	int i;
+
+	/* Figure out which column of the above table to use for this file. */
+	switch (file_id) {
+	case HFSPLUS_ATTR_CNID:
+		column = 0;
+		break;
+	case HFSPLUS_CAT_CNID:
+		column = 1;
+		break;
+	default:
+		column = 2;
+		break;
+	}
+
+	/*
+	 * The default clump size is 0.8% of the volume size. And
+	 * it must also be a multiple of the node and block size.
+	 */
+	if (sectors < 0x200000) {
+		clump_size = sectors << 2;	/*  0.8 %  */
+		if (clump_size < (8 * node_size))
+			clump_size = 8 * node_size;
+	} else {
+		/* turn exponent into table index... */
+		for (i = 0, sectors = sectors >> 22;
+		     sectors && (i < CLUMP_ENTRIES - 1);
+		     ++i, sectors = sectors >> 1) {
+			/* empty body */
+		}
+
+		clump_size = clumptbl[column + (i) * 3] * 1024 * 1024;
+	}
+
+	/*
+	 * Round the clump size to a multiple of node and block size.
+	 * NOTE: This rounds down.
+	 */
+	clump_size /= mod;
+	clump_size *= mod;
+
+	/*
+	 * Rounding down could have rounded down to 0 if the block size was
+	 * greater than the clump size.  If so, just use one block or node.
+	 */
+	if (clump_size == 0)
+		clump_size = mod;
+
+	return clump_size;
+}
 
 /* Get a reference to a B*Tree and do some initial checks */
 struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2b9cd01696e2..1e36f18e904b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -380,6 +380,7 @@ int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);
 int hfsplus_block_free(struct super_block *, u32, u32);
 
 /* btree.c */
+u32 hfsplus_calc_btree_clump_size(u32, u32, u64, int);
 struct hfs_btree *hfs_btree_open(struct super_block *, u32);
 void hfs_btree_close(struct hfs_btree *);
 int hfs_btree_write(struct hfs_btree *);
-- 
cgit v1.2.3


From 099e9245e04d50bb12ed621b4fa61df0a4c9dba9 Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:08 -0800
Subject: hfsplus: implement attributes file's header node initialization code

Implement functionality of AttributesFile's header node initialization.

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/hfsplus_raw.h |  3 +++
 fs/hfsplus/xattr.c       | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 2c54dd81def4..8ffb3a8ffe75 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -187,6 +187,9 @@ struct hfs_btree_header_rec {
 /* HFS+ BTree misc info */
 #define HFSPLUS_TREE_HEAD 0
 #define HFSPLUS_NODE_MXSZ 32768
+#define HFSPLUS_ATTR_TREE_NODE_SIZE		8192
+#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT	3
+#define HFSPLUS_BTREE_HDR_USER_BYTES		128
 
 /* Some special File ID numbers (stolen from hfs.h) */
 #define HFSPLUS_POR_CNID		1	/* Parent Of the Root */
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index bd8471fb9a6a..568a45cb1145 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -127,6 +127,71 @@ static int can_set_xattr(struct inode *inode, const char *name,
 	return 0;
 }
 
+static void hfsplus_init_header_node(struct inode *attr_file,
+					u32 clump_size,
+					char *buf, size_t node_size)
+{
+	struct hfs_bnode_desc *desc;
+	struct hfs_btree_header_rec *head;
+	u16 offset;
+	__be16 *rec_offsets;
+	u32 hdr_node_map_rec_bits;
+	char *bmp;
+	u32 used_nodes;
+	u32 used_bmp_bytes;
+
+	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %zu\n",
+				clump_size, node_size);
+
+	/* The end of the node contains list of record offsets */
+	rec_offsets = (__be16 *)(buf + node_size);
+
+	desc = (struct hfs_bnode_desc *)buf;
+	desc->type = HFS_NODE_HEADER;
+	desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT);
+	offset = sizeof(struct hfs_bnode_desc);
+	*--rec_offsets = cpu_to_be16(offset);
+
+	head = (struct hfs_btree_header_rec *)(buf + offset);
+	head->node_size = cpu_to_be16(node_size);
+	head->node_count = cpu_to_be32(i_size_read(attr_file) / node_size);
+	head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
+	head->clump_size = cpu_to_be32(clump_size);
+	head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
+	head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16));
+	offset += sizeof(struct hfs_btree_header_rec);
+	*--rec_offsets = cpu_to_be16(offset);
+	offset += HFSPLUS_BTREE_HDR_USER_BYTES;
+	*--rec_offsets = cpu_to_be16(offset);
+
+	hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16)));
+	if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) {
+		u32 map_node_bits;
+		u32 map_nodes;
+
+		desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1);
+		map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) -
+					(2 * sizeof(u16)) - 2);
+		map_nodes = (be32_to_cpu(head->node_count) -
+				hdr_node_map_rec_bits +
+				(map_node_bits - 1)) / map_node_bits;
+		be32_add_cpu(&head->free_nodes, 0 - map_nodes);
+	}
+
+	bmp = buf + offset;
+	used_nodes =
+		be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes);
+	used_bmp_bytes = used_nodes / 8;
+	if (used_bmp_bytes) {
+		memset(bmp, 0xFF, used_bmp_bytes);
+		bmp += used_bmp_bytes;
+		used_nodes %= 8;
+	}
+	*bmp = ~(0xFF >> used_nodes);
+	offset += hdr_node_map_rec_bits / 8;
+	*--rec_offsets = cpu_to_be16(offset);
+}
+
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags)
 {
-- 
cgit v1.2.3


From 95e0d7dbb9b28ab0dfad7c7316066b05e1f1d4cd Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:09 -0800
Subject: hfsplus: implement attributes file creation functionality

Implement functionality of creation AttributesFile metadata file on HFS+
volume in the case of absence of it.

It makes trying to open AttributesFile's B-tree during mount of HFS+
volume.  If HFS+ volume hasn't AttributesFile then a pointer on
AttributesFile's B-tree keeps as NULL.  Thereby, when it is discovered
absence of AttributesFile on HFS+ volume in the begin of xattr creation
operation then AttributesFile will be created.

The creation of AttributesFile will have success in the case of
availability (2 * clump) free blocks on HFS+ volume.  Otherwise,
creation operation is ended with error (-ENOSPC).

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/hfsplus_fs.h |   9 +++
 fs/hfsplus/super.c      |   2 +
 fs/hfsplus/xattr.c      | 142 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 1e36f18e904b..08846425b67f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -126,6 +126,14 @@ struct hfs_bnode {
 #define HFS_BNODE_DIRTY		3
 #define HFS_BNODE_DELETED	4
 
+/*
+ * Attributes file states
+ */
+#define HFSPLUS_EMPTY_ATTR_TREE		0
+#define HFSPLUS_CREATING_ATTR_TREE	1
+#define HFSPLUS_VALID_ATTR_TREE		2
+#define HFSPLUS_FAILED_ATTR_TREE	3
+
 /*
  * HFS+ superblock info (built from Volume Header on disk)
  */
@@ -141,6 +149,7 @@ struct hfsplus_sb_info {
 	struct hfs_btree *ext_tree;
 	struct hfs_btree *cat_tree;
 	struct hfs_btree *attr_tree;
+	atomic_t attr_tree_state;
 	struct inode *alloc_file;
 	struct inode *hidden_dir;
 	struct nls_table *nls;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4c4d142cf890..80875aa640ef 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -474,12 +474,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		pr_err("failed to load catalog file\n");
 		goto out_close_ext_tree;
 	}
+	atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
 	if (vhdr->attr_file.total_blocks != 0) {
 		sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
 		if (!sbi->attr_tree) {
 			pr_err("failed to load attributes file\n");
 			goto out_close_cat_tree;
 		}
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
 	}
 	sb->s_xattr = hfsplus_xattr_handlers;
 
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 568a45cb1145..efc85b1377cc 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -192,6 +192,143 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 	*--rec_offsets = cpu_to_be16(offset);
 }
 
+static int hfsplus_create_attributes_file(struct super_block *sb)
+{
+	int err = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct inode *attr_file;
+	struct hfsplus_inode_info *hip;
+	u32 clump_size;
+	u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE;
+	char *buf;
+	int index, written;
+	struct address_space *mapping;
+	struct page *page;
+	int old_state = HFSPLUS_EMPTY_ATTR_TREE;
+
+	hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+
+check_attr_tree_state_again:
+	switch (atomic_read(&sbi->attr_tree_state)) {
+	case HFSPLUS_EMPTY_ATTR_TREE:
+		if (old_state != atomic_cmpxchg(&sbi->attr_tree_state,
+						old_state,
+						HFSPLUS_CREATING_ATTR_TREE))
+			goto check_attr_tree_state_again;
+		break;
+	case HFSPLUS_CREATING_ATTR_TREE:
+		/*
+		 * This state means that another thread is in process
+		 * of AttributesFile creation. Theoretically, it is
+		 * possible to be here. But really __setxattr() method
+		 * first of all calls hfs_find_init() for lookup in
+		 * B-tree of CatalogFile. This method locks mutex of
+		 * CatalogFile's B-tree. As a result, if some thread
+		 * is inside AttributedFile creation operation then
+		 * another threads will be waiting unlocking of
+		 * CatalogFile's B-tree's mutex. However, if code will
+		 * change then we will return error code (-EAGAIN) from
+		 * here. Really, it means that first try to set of xattr
+		 * fails with error but second attempt will have success.
+		 */
+		return -EAGAIN;
+	case HFSPLUS_VALID_ATTR_TREE:
+		return 0;
+	case HFSPLUS_FAILED_ATTR_TREE:
+		return -EOPNOTSUPP;
+	default:
+		BUG();
+	}
+
+	attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID);
+	if (IS_ERR(attr_file)) {
+		pr_err("failed to load attributes file\n");
+		return PTR_ERR(attr_file);
+	}
+
+	BUG_ON(i_size_read(attr_file) != 0);
+
+	hip = HFSPLUS_I(attr_file);
+
+	clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize,
+						    node_size,
+						    sbi->sect_count,
+						    HFSPLUS_ATTR_CNID);
+
+	mutex_lock(&hip->extents_lock);
+	hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift;
+	mutex_unlock(&hip->extents_lock);
+
+	if (sbi->free_blocks <= (hip->clump_blocks << 1)) {
+		err = -ENOSPC;
+		goto end_attr_file_creation;
+	}
+
+	while (hip->alloc_blocks < hip->clump_blocks) {
+		err = hfsplus_file_extend(attr_file);
+		if (unlikely(err)) {
+			pr_err("failed to extend attributes file\n");
+			goto end_attr_file_creation;
+		}
+		hip->phys_size = attr_file->i_size =
+			(loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift;
+		hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift;
+		inode_set_bytes(attr_file, attr_file->i_size);
+	}
+
+	buf = kzalloc(node_size, GFP_NOFS);
+	if (!buf) {
+		pr_err("failed to allocate memory for header node\n");
+		err = -ENOMEM;
+		goto end_attr_file_creation;
+	}
+
+	hfsplus_init_header_node(attr_file, clump_size, buf, node_size);
+
+	mapping = attr_file->i_mapping;
+
+	index = 0;
+	written = 0;
+	for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+		void *kaddr;
+
+		page = read_mapping_page(mapping, index, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed_header_node_init;
+		}
+
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, buf + written,
+			min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+		kunmap_atomic(kaddr);
+
+		set_page_dirty(page);
+		page_cache_release(page);
+	}
+
+	hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
+
+	sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+	if (!sbi->attr_tree)
+		pr_err("failed to load attributes file\n");
+
+failed_header_node_init:
+	kfree(buf);
+
+end_attr_file_creation:
+	iput(attr_file);
+
+	if (!err)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
+	else if (err == -ENOSPC)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
+	else
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE);
+
+	return err;
+}
+
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags)
 {
@@ -276,8 +413,9 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
 	}
 
 	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
-		err = -EOPNOTSUPP;
-		goto end_setxattr;
+		err = hfsplus_create_attributes_file(inode->i_sb);
+		if (unlikely(err))
+			goto end_setxattr;
 	}
 
 	if (hfsplus_attr_exists(inode, name)) {
-- 
cgit v1.2.3


From 5721cf84d9d8d41915f77dd639969b7fa91b6a79 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Tue, 12 Nov 2013 15:11:15 -0800
Subject: procfs: clean up proc_reg_get_unmapped_area for 80-column limit

Clean up proc_reg_get_unmapped_area due to its 80-column limit
violation.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Tested-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8eaa1ba793fc..28955d4b7218 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -285,19 +285,23 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 	return rv;
 }
 
-static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+			   unsigned long len, unsigned long pgoff,
+			   unsigned long flags)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	unsigned long rv = -EIO;
-	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL;
+	unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
+				  unsigned long, unsigned long) = NULL;
 	if (use_pde(pde)) {
 #ifdef CONFIG_MMU
-		get_unmapped_area = current->mm->get_unmapped_area;
+		get_area = current->mm->get_unmapped_area;
 #endif
 		if (pde->proc_fops->get_unmapped_area)
-			get_unmapped_area = pde->proc_fops->get_unmapped_area;
-		if (get_unmapped_area)
-			rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
+			get_area = pde->proc_fops->get_unmapped_area;
+		if (get_area)
+			rv = get_area(file, orig_addr, len, pgoff, flags);
 		unuse_pde(pde);
 	}
 	return rv;
-- 
cgit v1.2.3


From 1c3fc3e5cc8a81d21b199cb739d5d9c51f3688f4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Nov 2013 15:11:16 -0800
Subject: kcore: add Kconfig help text

Under Pseudo filesystems, /proc/kcore support has no help.

Fixes a portion of kernel bugzilla #52671:
  https://bugzilla.kernel.org/show_bug.cgi?id=52671

Thanks for David Howells for the help text.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: <lailavrazda1979@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af6222f8a4..2183fcf41d59 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,10 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
+	help
+	  Provides a virtual ELF core file of the live kernel.  This can
+	  be read with gdb and other ELF tools.  No modifications can be
+	  made using this mechanism.
 
 config PROC_VMCORE
 	bool "/proc/vmcore support"
-- 
cgit v1.2.3


From d049f74f2dbe71354d43d393ac3a188947811348 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 12 Nov 2013 15:11:17 -0800
Subject: exec/ptrace: fix get_dumpable() incorrect tests

The get_dumpable() return value is not boolean.  Most users of the
function actually want to be testing for non-SUID_DUMP_USER(1) rather than
SUID_DUMP_DISABLE(0).  The SUID_DUMP_ROOT(2) is also considered a
protected state.  Almost all places did this correctly, excepting the two
places fixed in this patch.

Wrong logic:
    if (dumpable == SUID_DUMP_DISABLE) { /* be protective */ }
        or
    if (dumpable == 0) { /* be protective */ }
        or
    if (!dumpable) { /* be protective */ }

Correct logic:
    if (dumpable != SUID_DUMP_USER) { /* be protective */ }
        or
    if (dumpable != 1) { /* be protective */ }

Without this patch, if the system had set the sysctl fs/suid_dumpable=2, a
user was able to ptrace attach to processes that had dropped privileges to
that user.  (This may have been partially mitigated if Yama was enabled.)

The macros have been moved into the file that declares get/set_dumpable(),
which means things like the ia64 code can see them too.

CVE-2013-2929

Reported-by: Vasily Kulikov <segoon@openwall.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/processor.h | 2 +-
 fs/exec.c                         | 6 ++++++
 include/linux/binfmts.h           | 3 ---
 include/linux/sched.h             | 4 ++++
 kernel/ptrace.c                   | 3 ++-
 5 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index e0a899a1a8a6..5a84b3a50741 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -319,7 +319,7 @@ struct thread_struct {
 	regs->loadrs = 0;									\
 	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
 	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
-	if (unlikely(!get_dumpable(current->mm))) {							\
+	if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) {	\
 		/*										\
 		 * Zap scratch regs to avoid leaking bits between processes with different	\
 		 * uid/privileges.								\
diff --git a/fs/exec.c b/fs/exec.c
index 2ea437e5acf4..12120620f040 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1669,6 +1669,12 @@ int __get_dumpable(unsigned long mm_flags)
 	return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
 }
 
+/*
+ * This returns the actual value of the suid_dumpable flag. For things
+ * that are using this for checking for privilege transitions, it must
+ * test against SUID_DUMP_USER rather than treating it as a boolean
+ * value.
+ */
 int get_dumpable(struct mm_struct *mm)
 {
 	return __get_dumpable(mm->flags);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index e8112ae50531..7554fd410bcc 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -99,9 +99,6 @@ extern void setup_new_exec(struct linux_binprm * bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
 
 extern int suid_dumpable;
-#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
-#define SUID_DUMP_USER		1	/* Dump as user of process */
-#define SUID_DUMP_ROOT		2	/* Dump as root */
 
 /* Stack area protections */
 #define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e226fe3e512..f7efc8604652 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -323,6 +323,10 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 extern void set_dumpable(struct mm_struct *mm, int value);
 extern int get_dumpable(struct mm_struct *mm);
 
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
 /* mm flags */
 /* dumpable bits */
 #define MMF_DUMPABLE      0  /* core dump is permitted */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dd562e9aa2c8..1f4bcb3cc21c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
 	rcu_read_lock();
-	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+	if (dumpable != SUID_DUMP_USER &&
+	    !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
 		rcu_read_unlock();
 		return -EPERM;
 	}
-- 
cgit v1.2.3


From 66da0e1f9034140ae2f571ef96e254a25083906c Mon Sep 17 00:00:00 2001
From: Ilija Hadzic <ihadzic@research.bell-labs.com>
Date: Tue, 12 Nov 2013 15:11:45 -0800
Subject: devpts: plug the memory leak in kill_sb

When devpts is unmounted, there may be a no-longer-used IDR tree hanging
off the superblock we are about to kill.  This needs to be cleaned up
before destroying the SB.

The leak is usually not a big deal because unmounting devpts is typically
done when shutting down the whole machine.  However, shutting down an LXC
container instead of a physical machine exposes the problem (the garbage
is detectable with kmemleak).

Signed-off-by: Ilija Hadzic <ihadzic@research.bell-labs.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 073d30b9d1ac..a726b9f29cb7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -498,6 +498,7 @@ static void devpts_kill_sb(struct super_block *sb)
 {
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 
+	ida_destroy(&fsi->allocated_ptys);
 	kfree(fsi);
 	kill_litter_super(sb);
 }
-- 
cgit v1.2.3