From 6d243a235612946971ba98f24f52dc99f4ebb32a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 21 Feb 2018 16:35:50 -0500
Subject: NFSv4: Fix broken cast in nfs4_callback_recallany()

Passing a pointer to a unsigned integer to test_bit() is broken.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/callback_proc.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2435af56b87e..a50d7813e3ea 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -572,7 +572,7 @@ out:
 }
 
 static bool
-validate_bitmap_values(unsigned long mask)
+validate_bitmap_values(unsigned int mask)
 {
 	return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
 }
@@ -596,17 +596,15 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
 		goto out;
 
 	status = cpu_to_be32(NFS4_OK);
-	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
-		     &args->craa_type_mask))
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG))
 		flags = FMODE_READ;
-	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
-		     &args->craa_type_mask))
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG))
 		flags |= FMODE_WRITE;
-	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
-		     &args->craa_type_mask))
-		pnfs_recall_all_layouts(cps->clp);
 	if (flags)
 		nfs_expire_unused_delegation_types(cps->clp, flags);
+
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
+		pnfs_recall_all_layouts(cps->clp);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
-- 
cgit v1.2.3


From ad86f605c59500da82d196ac312cfbac3daba31d Mon Sep 17 00:00:00 2001
From: "Bill.Baker@oracle.com" <Bill.Baker@oracle.com>
Date: Wed, 21 Feb 2018 12:46:43 -0600
Subject: nfs: system crashes after NFS4ERR_MOVED recovery

nfs4_update_server unconditionally releases the nfs_client for the
source server. If migration fails, this can cause the source server's
nfs_client struct to be left with a low reference count, resulting in
use-after-free.  Also, adjust reference count handling for ELOOP.

NFS: state manager: migration failed on NFSv4 server nfsvmu10 with error 6
WARNING: CPU: 16 PID: 17960 at fs/nfs/client.c:281 nfs_put_client+0xfa/0x110 [nfs]()
	nfs_put_client+0xfa/0x110 [nfs]
	nfs4_run_state_manager+0x30/0x40 [nfsv4]
	kthread+0xd8/0xf0

BUG: unable to handle kernel NULL pointer dereference at 00000000000002a8
	nfs4_xdr_enc_write+0x6b/0x160 [nfsv4]
	rpcauth_wrap_req+0xac/0xf0 [sunrpc]
	call_transmit+0x18c/0x2c0 [sunrpc]
	__rpc_execute+0xa6/0x490 [sunrpc]
	rpc_async_schedule+0x15/0x20 [sunrpc]
	process_one_work+0x160/0x470
	worker_thread+0x112/0x540
	? rescuer_thread+0x3f0/0x3f0
	kthread+0xd8/0xf0

This bug was introduced by 32e62b7c ("NFS: Add nfs4_update_server"),
but the fix applies cleanly to 52442f9b ("NFS4: Avoid migration loops")

Reported-by: Helen Chao <helen.chao@oracle.com>
Fixes: 52442f9b11b7 ("NFS4: Avoid migration loops")
Signed-off-by: Bill Baker <bill.baker@oracle.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4client.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 04612c24d394..979631411a0e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -868,8 +868,10 @@ static int nfs4_set_client(struct nfs_server *server,
 	if (IS_ERR(clp))
 		return PTR_ERR(clp);
 
-	if (server->nfs_client == clp)
+	if (server->nfs_client == clp) {
+		nfs_put_client(clp);
 		return -ELOOP;
+	}
 
 	/*
 	 * Query for the lease time on clientid setup or renewal
@@ -1244,11 +1246,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
 				clp->cl_proto, clnt->cl_timeout,
 				clp->cl_minorversion, net);
 	clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
-	nfs_put_client(clp);
 	if (error != 0) {
 		nfs_server_insert_lists(server);
 		return error;
 	}
+	nfs_put_client(clp);
 
 	if (server->nfs_client->cl_hostname == NULL)
 		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
-- 
cgit v1.2.3


From 1b7204064582792b77c6be796e78bd821c9f71b1 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 7 Feb 2018 11:27:54 +0000
Subject: NFS: make struct nlmclnt_fl_close_lock_ops static

The structure nlmclnt_fl_close_lock_ops s local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse warning:
fs/nfs/nfs3proc.c:876:33: warning: symbol 'nlmclnt_fl_close_lock_ops' was not
declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs3proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 49f848fd1f04..7327930ad970 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -873,7 +873,7 @@ static void nfs3_nlm_release_call(void *data)
 	}
 }
 
-const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
+static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
 	.nlmclnt_alloc_call = nfs3_nlm_alloc_call,
 	.nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare,
 	.nlmclnt_release_call = nfs3_nlm_release_call,
-- 
cgit v1.2.3


From 86516eff3b09a5fd17e81d50925bbccc6a36beed Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 22 Feb 2018 14:41:25 -0800
Subject: xfs: use memset to initialize xfs_scrub_agfl_info

Apparently different gcc versions have competing and
incompatible notions of how to initialize at declaration,
so just give up and fall back to the time-tested memset().

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/agheader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index fd975524f460..05c66e05ae20 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -767,7 +767,7 @@ int
 xfs_scrub_agfl(
 	struct xfs_scrub_context	*sc)
 {
-	struct xfs_scrub_agfl_info	sai = { 0 };
+	struct xfs_scrub_agfl_info	sai;
 	struct xfs_agf			*agf;
 	xfs_agnumber_t			agno;
 	unsigned int			agflcount;
@@ -795,6 +795,7 @@ xfs_scrub_agfl(
 		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
 		goto out;
 	}
+	memset(&sai, 0, sizeof(sai));
 	sai.sz_entries = agflcount;
 	sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
 	if (!sai.entries) {
-- 
cgit v1.2.3


From b31c2bdcd83e3374fec5a8e27a2fb4d26e771c52 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 22 Feb 2018 14:41:25 -0800
Subject: xfs: reserve blocks for refcount / rmap log item recovery

During log recovery, the per-AG reservations aren't yet set up, so log
recovery has to reserve enough blocks to handle all possible btree
splits.

Reported-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_refcount_item.c | 9 ++++++---
 fs/xfs/xfs_rmap_item.c     | 4 +++-
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 3a55d6fc271b..7a39f40645f7 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -23,6 +23,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
+#include "xfs_shared.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_trans.h"
@@ -456,10 +457,12 @@ xfs_cui_recover(
 	 * transaction.  Normally, any work that needs to be deferred
 	 * gets attached to the same defer_ops that scheduled the
 	 * refcount update.  However, we're in log recovery here, so we
-	 * we create our own defer_ops and use that to finish up any
-	 * work that doesn't fit.
+	 * we use the passed in defer_ops and to finish up any work that
+	 * doesn't fit.  We need to reserve enough blocks to handle a
+	 * full btree split on either end of the refcount range.
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+			mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
 	if (error)
 		return error;
 	cudp = xfs_trans_get_cud(tp, cuip);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index f3b139c9aa16..49d3124863a8 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -23,6 +23,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
+#include "xfs_shared.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_trans.h"
@@ -470,7 +471,8 @@ xfs_rui_recover(
 		}
 	}
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+			mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
 	if (error)
 		return error;
 	rudp = xfs_trans_get_rud(tp, ruip);
-- 
cgit v1.2.3


From 6ef0bc6ddee1f62310877a1d53b1ea1d0d8e51a2 Mon Sep 17 00:00:00 2001
From: Zhi Zhang <zhang.david2011@gmail.com>
Date: Wed, 24 Jan 2018 21:24:33 +0800
Subject: ceph: flush dirty caps of unlinked inode ASAP

Client should release unlinked inode from its cache ASAP. But client
can't release inode with dirty caps.

Link: http://tracker.ceph.com/issues/22886
Signed-off-by: Zhi Zhang <zhang.david2011@gmail.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c  | 26 ++++++++++++++++++++++++++
 fs/ceph/dir.c   | 28 +++++-----------------------
 fs/ceph/super.h |  2 +-
 3 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6582c4507e6c..0e5bd3e3344e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3964,6 +3964,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
 		ceph_check_caps(ci, 0, NULL);
 }
 
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+int ceph_drop_caps_for_unlink(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (inode->i_nlink == 1) {
+		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+
+		ci->i_ceph_flags |= CEPH_I_NODELAY;
+		if (__ceph_caps_dirty(ci)) {
+			struct ceph_mds_client *mdsc =
+				ceph_inode_to_client(inode)->mdsc;
+			__cap_delay_requeue_front(mdsc, ci);
+		}
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	return drop;
+}
+
 /*
  * Helpers for embedding cap and dentry lease releases into mds
  * requests.
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0c4346806e17..f1d9c6cc0491 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1002,26 +1002,6 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
-/*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
- * looks like the link count will hit 0, drop any other caps (other
- * than PIN) we don't specifically want (due to the file still being
- * open).
- */
-static int drop_caps_for_unlink(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-
-	spin_lock(&ci->i_ceph_lock);
-	if (inode->i_nlink == 1) {
-		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
-		ci->i_ceph_flags |= CEPH_I_NODELAY;
-	}
-	spin_unlock(&ci->i_ceph_lock);
-	return drop;
-}
-
 /*
  * rmdir and unlink are differ only by the metadata op code
  */
@@ -1056,7 +1036,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	req->r_inode_drop = drop_caps_for_unlink(inode);
+	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
 	err = ceph_mdsc_do_request(mdsc, dir, req);
 	if (!err && !req->r_reply_info.head->is_dentry)
 		d_delete(dentry);
@@ -1104,8 +1084,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 	/* release LINK_RDCACHE on source inode (mds will lock it) */
 	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-	if (d_really_is_positive(new_dentry))
-		req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
+	if (d_really_is_positive(new_dentry)) {
+		req->r_inode_drop =
+			ceph_drop_caps_for_unlink(d_inode(new_dentry));
+	}
 	err = ceph_mdsc_do_request(mdsc, old_dir, req);
 	if (!err && !req->r_reply_info.head->is_dentry) {
 		/*
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 21b2e5b004eb..1c2086e0fec2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -987,7 +987,7 @@ extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
 extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
-
+extern int  ceph_drop_caps_for_unlink(struct inode *inode);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
 				     int mds, int drop, int unless, int force);
 extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
-- 
cgit v1.2.3


From 937441f3a3158d5510ca8cc78a82453f57a96365 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@icloud.com>
Date: Tue, 6 Feb 2018 08:25:55 +0800
Subject: libceph, ceph: avoid memory leak when specifying same option several
 times

When parsing string option, in order to avoid memory leak we need to
carefully free it first in case of specifying same option several times.

Signed-off-by: Chengguang Xu <cgxu519@icloud.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c        | 2 ++
 net/ceph/ceph_common.c | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a62d2a9841dc..bfc85b22a190 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -225,6 +225,7 @@ static int parse_fsopt_token(char *c, void *private)
 			return -ENOMEM;
 		break;
 	case Opt_mds_namespace:
+		kfree(fsopt->mds_namespace);
 		fsopt->mds_namespace = kstrndup(argstr[0].from,
 						argstr[0].to-argstr[0].from,
 						GFP_KERNEL);
@@ -232,6 +233,7 @@ static int parse_fsopt_token(char *c, void *private)
 			return -ENOMEM;
 		break;
 	case Opt_fscache_uniq:
+		kfree(fsopt->fscache_uniq);
 		fsopt->fscache_uniq = kstrndup(argstr[0].from,
 					       argstr[0].to-argstr[0].from,
 					       GFP_KERNEL);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 1e492ef2a33d..4d4c82229e9e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -418,6 +418,7 @@ ceph_parse_options(char *options, const char *dev_name,
 				opt->flags |= CEPH_OPT_FSID;
 			break;
 		case Opt_name:
+			kfree(opt->name);
 			opt->name = kstrndup(argstr[0].from,
 					      argstr[0].to-argstr[0].from,
 					      GFP_KERNEL);
@@ -427,6 +428,9 @@ ceph_parse_options(char *options, const char *dev_name,
 			}
 			break;
 		case Opt_secret:
+			ceph_crypto_key_destroy(opt->key);
+			kfree(opt->key);
+
 		        opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
 			if (!opt->key) {
 				err = -ENOMEM;
@@ -437,6 +441,9 @@ ceph_parse_options(char *options, const char *dev_name,
 				goto out;
 			break;
 		case Opt_key:
+			ceph_crypto_key_destroy(opt->key);
+			kfree(opt->key);
+
 		        opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
 			if (!opt->key) {
 				err = -ENOMEM;
-- 
cgit v1.2.3


From 18106734b512664a8541026519ce4b862498b6c3 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@icloud.com>
Date: Fri, 9 Feb 2018 20:40:59 +0800
Subject: ceph: fix dentry leak when failing to init debugfs

When failing from ceph_fs_debugfs_init() in ceph_real_mount(),
there is lack of dput of root_dentry and it causes slab errors,
so change the calling order of ceph_fs_debugfs_init() and
open_root_dentry() and do some cleanups to avoid this issue.

Signed-off-by: Chengguang Xu <cgxu519@icloud.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bfc85b22a190..1c470b453a9e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -838,7 +838,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 	int err;
 	unsigned long started = jiffies;  /* note the start time */
 	struct dentry *root;
-	int first = 0;   /* first vfsmount for this super_block */
 
 	dout("mount start %p\n", fsc);
 	mutex_lock(&fsc->client->mount_mutex);
@@ -863,17 +862,17 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 			path = fsc->mount_options->server_path + 1;
 			dout("mount opening path %s\n", path);
 		}
+
+		err = ceph_fs_debugfs_init(fsc);
+		if (err < 0)
+			goto out;
+
 		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
 			goto out;
 		}
 		fsc->sb->s_root = dget(root);
-		first = 1;
-
-		err = ceph_fs_debugfs_init(fsc);
-		if (err < 0)
-			goto fail;
 	} else {
 		root = dget(fsc->sb->s_root);
 	}
@@ -883,11 +882,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 	mutex_unlock(&fsc->client->mount_mutex);
 	return root;
 
-fail:
-	if (first) {
-		dput(fsc->sb->s_root);
-		fsc->sb->s_root = NULL;
-	}
 out:
 	mutex_unlock(&fsc->client->mount_mutex);
 	return ERR_PTR(err);
-- 
cgit v1.2.3


From d9c10e5b8863cfb6886d1640386455075c6e979d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 12:51:43 +0100
Subject: direct-io: Fix sleep in atomic due to sync AIO

Commit e864f39569f4 "fs: add RWF_DSYNC aand RWF_SYNC" added additional
way for direct IO to become synchronous and thus trigger fsync from the
IO completion handler. Then commit 9830f4be159b "fs: Use RWF_* flags for
AIO operations" allowed these flags to be set for AIO as well. However
that commit forgot to update the condition checking whether the IO
completion handling should be defered to a workqueue and thus AIO DIO
with RWF_[D]SYNC set will call fsync() from IRQ context resulting in
sleep in atomic.

Fix the problem by checking directly iocb flags (the same way as it is
done in dio_complete()) instead of checking all conditions that could
lead to IO being synchronous.

CC: Christoph Hellwig <hch@lst.de>
CC: Goldwyn Rodrigues <rgoldwyn@suse.com>
CC: stable@vger.kernel.org
Reported-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 9830f4be159b29399d107bffb99e0132bc5aedd4
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/direct-io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index a0ca9e48e993..1357ef563893 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1274,8 +1274,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 */
 	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
 		retval = 0;
-		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
-		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+		if (iocb->ki_flags & IOCB_DSYNC)
 			retval = dio_set_defer_completion(dio);
 		else if (!dio->inode->i_sb->s_dio_done_wq) {
 			/*
-- 
cgit v1.2.3


From 9df6c29912315186fef1c79cc15b758ace84175b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:39 +0100
Subject: genhd: Add helper put_disk_and_module()

Add a proper counterpart to get_disk_and_module() -
put_disk_and_module(). Currently it is opencoded in several places.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c    | 11 ++---------
 block/genhd.c         | 20 ++++++++++++++++----
 fs/block_dev.c        | 19 +++++--------------
 include/linux/genhd.h |  1 +
 4 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4117524ca45b..c2033a232a44 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -812,7 +812,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	struct gendisk *disk;
 	struct request_queue *q;
 	struct blkcg_gq *blkg;
-	struct module *owner;
 	unsigned int major, minor;
 	int key_len, part, ret;
 	char *body;
@@ -904,9 +903,7 @@ fail_unlock:
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 fail:
-	owner = disk->fops->owner;
-	put_disk(disk);
-	module_put(owner);
+	put_disk_and_module(disk);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
 	 * short msleep().  It isn't strictly necessary but queue
@@ -931,13 +928,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
-	struct module *owner;
-
 	spin_unlock_irq(ctx->disk->queue->queue_lock);
 	rcu_read_unlock();
-	owner = ctx->disk->fops->owner;
-	put_disk(ctx->disk);
-	module_put(owner);
+	put_disk_and_module(ctx->disk);
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
diff --git a/block/genhd.c b/block/genhd.c
index 21b2843b27d0..4c0590434591 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -817,10 +817,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 	}
 
 	if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
-		struct module *owner = disk->fops->owner;
-
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 		disk = NULL;
 	}
 	return disk;
@@ -1483,6 +1480,21 @@ void put_disk(struct gendisk *disk)
 }
 EXPORT_SYMBOL(put_disk);
 
+/*
+ * This is a counterpart of get_disk_and_module() and thus also of
+ * get_gendisk().
+ */
+void put_disk_and_module(struct gendisk *disk)
+{
+	if (disk) {
+		struct module *owner = disk->fops->owner;
+
+		put_disk(disk);
+		module_put(owner);
+	}
+}
+EXPORT_SYMBOL(put_disk_and_module);
+
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 {
 	char event[] = "DISK_RO=1";
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..1dbbf847911a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1111,8 +1111,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 	else
 		whole = bdgrab(bdev);
 
-	module_put(disk->fops->owner);
-	put_disk(disk);
+	put_disk_and_module(disk);
 	if (!whole)
 		return ERR_PTR(-ENOMEM);
 
@@ -1407,7 +1406,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
-	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1433,7 +1431,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
-	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1463,8 +1460,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdev->bd_queue = NULL;
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
-					put_disk(disk);
-					module_put(owner);
+					put_disk_and_module(disk);
 					goto restart;
 				}
 			}
@@ -1525,8 +1521,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1546,8 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	put_disk(disk);
-	module_put(owner);
+	put_disk_and_module(disk);
  out:
 	bdput(bdev);
 
@@ -1770,8 +1764,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 			disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
-		struct module *owner = disk->fops->owner;
-
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
@@ -1779,8 +1771,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
 
-		put_disk(disk);
-		module_put(owner);
+		put_disk_and_module(disk);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 8e11b9321e55..7f5906fe1b70 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -602,6 +602,7 @@ extern void printk_all_partitions(void);
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
 extern struct kobject *get_disk_and_module(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
+extern void put_disk_and_module(struct gendisk *disk);
 extern void blk_register_region(dev_t devt, unsigned long range,
 			struct module *module,
 			struct kobject *(*probe)(dev_t, int *, void *),
-- 
cgit v1.2.3


From 897366537fb65e87755b822360c230354c3fc73b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:40 +0100
Subject: genhd: Fix use after free in __blkdev_get()

When two blkdev_open() calls race with device removal and recreation,
__blkdev_get() can use looked up gendisk after it is freed:

CPU0				CPU1			CPU2
							del_gendisk(disk);
							  bdev_unhash_inode(inode);
blkdev_open()			blkdev_open()
  bdev = bd_acquire(inode);
    - creates and returns new inode
				  bdev = bd_acquire(inode);
				    - returns the same inode
  __blkdev_get(devt)		  __blkdev_get(devt)
    disk = get_gendisk(devt);
      - got structure of device going away
							<finish device removal>
							<new device gets
							 created under the same
							 device number>
				  disk = get_gendisk(devt);
				    - got new device structure
				  if (!bdev->bd_openers) {
				    does the first open
				  }
    if (!bdev->bd_openers)
      - false
    } else {
      put_disk_and_module(disk)
        - remember this was old device - this was last ref and disk is
          now freed
    }
    disk_unblock_events(disk); -> oops

Fix the problem by making sure we drop reference to disk in
__blkdev_get() only after we are really done with it.

Reported-by: Hou Tao <houtao1@huawei.com>
Tested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1dbbf847911a..fe41a76769fa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1409,6 +1409,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	int ret;
 	int partno;
 	int perm = 0;
+	bool first_open = false;
 
 	if (mode & FMODE_READ)
 		perm |= MAY_READ;
@@ -1435,6 +1436,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
+		first_open = true;
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
@@ -1520,14 +1522,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (ret)
 				goto out_unlock_bdev;
 		}
-		/* only one opener holds refs to the module and disk */
-		put_disk_and_module(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
+	/* only one opener holds refs to the module and disk */
+	if (!first_open)
+		put_disk_and_module(disk);
 	return 0;
 
  out_clear:
-- 
cgit v1.2.3


From 560e7cb2f3c7f09bbfb36cd0b900e24fddd20282 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Feb 2018 13:01:42 +0100
Subject: blockdev: Avoid two active bdev inodes for one device

When blkdev_open() races with device removal and creation it can happen
that unhashed bdev inode gets associated with newly created gendisk
like:

CPU0					CPU1
blkdev_open()
  bdev = bd_acquire()
					del_gendisk()
					  bdev_unhash_inode(bdev);
					remove device
					create new device with the same number
  __blkdev_get()
    disk = get_gendisk()
      - gets reference to gendisk of the new device

Now another blkdev_open() will not find original 'bdev' as it got
unhashed, create a new one and associate it with the same 'disk' at
which point problems start as we have two independent page caches for
one device.

Fix the problem by verifying that the bdev inode didn't get unhashed
before we acquired gendisk reference. That way we make sure gendisk can
get associated only with visible bdev inodes.

Tested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe41a76769fa..fe09ef9c21f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1058,6 +1058,27 @@ retry:
 	return 0;
 }
 
+static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
+{
+	struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
+
+	if (!disk)
+		return NULL;
+	/*
+	 * Now that we hold gendisk reference we make sure bdev we looked up is
+	 * not stale. If it is, it means device got removed and created before
+	 * we looked up gendisk and we fail open in such case. Associating
+	 * unhashed bdev with newly created gendisk could lead to two bdevs
+	 * (and thus two independent caches) being associated with one device
+	 * which is bad.
+	 */
+	if (inode_unhashed(bdev->bd_inode)) {
+		put_disk_and_module(disk);
+		return NULL;
+	}
+	return disk;
+}
+
 /**
  * bd_start_claiming - start claiming a block device
  * @bdev: block device of interest
@@ -1094,7 +1115,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 	 * @bdev might not have been initialized properly yet, look up
 	 * and grab the outer block device the hard way.
 	 */
-	disk = get_gendisk(bdev->bd_dev, &partno);
+	disk = bdev_get_gendisk(bdev, &partno);
 	if (!disk)
 		return ERR_PTR(-ENXIO);
 
@@ -1429,7 +1450,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  restart:
 
 	ret = -ENXIO;
-	disk = get_gendisk(bdev->bd_dev, &partno);
+	disk = bdev_get_gendisk(bdev, &partno);
 	if (!disk)
 		goto out;
 
-- 
cgit v1.2.3


From 5b4c845ea4f4b86c43096eb924354c83a2e26f3c Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@icloud.com>
Date: Sun, 25 Feb 2018 12:17:31 -0800
Subject: xfs: fix potential memory leak in mount option parsing

When specifying string type mount option (e.g., logdev)
several times in a mount, current option parsing may
cause memory leak. Hence, call kfree for previous one
in this case.

Signed-off-by: Chengguang Xu <cgxu519@icloud.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 7aba628dc527..93588ea3d3d2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -250,6 +250,7 @@ xfs_parseargs(
 				return -EINVAL;
 			break;
 		case Opt_logdev:
+			kfree(mp->m_logname);
 			mp->m_logname = match_strdup(args);
 			if (!mp->m_logname)
 				return -ENOMEM;
@@ -258,6 +259,7 @@ xfs_parseargs(
 			xfs_warn(mp, "%s option not allowed on this system", p);
 			return -EINVAL;
 		case Opt_rtdev:
+			kfree(mp->m_rtname);
 			mp->m_rtname = match_strdup(args);
 			if (!mp->m_rtname)
 				return -ENOMEM;
-- 
cgit v1.2.3


From a8fd1f71749387c9a1053a83ff1c16287499a4e7 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 15 Feb 2018 22:59:47 -0500
Subject: btrfs: use kvzalloc to allocate btrfs_fs_info

The srcu_struct in btrfs_fs_info scales in size with NR_CPUS.  On
kernels built with NR_CPUS=8192, this can result in kmalloc failures
that prevent mounting.

There is work in progress to try to resolve this for every user of
srcu_struct but using kvzalloc will work around the failures until
that is complete.

As an example with NR_CPUS=512 on x86_64: the overall size of
subvol_srcu is 3460 bytes, fs_info is 6496.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/super.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..0f521ba5f2f9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2974,7 +2974,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
 	security_free_mnt_opts(&fs_info->security_opts);
-	kfree(fs_info);
+	kvfree(fs_info);
 }
 
 /* tree mod log functions from ctree.c */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6e71a2a78363..4b817947e00f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1545,7 +1545,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 	 * it for searching for existing supers, so this lets us do that and
 	 * then open_ctree will properly initialize everything later.
 	 */
-	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
 	if (!fs_info) {
 		error = -ENOMEM;
 		goto error_sec_opts;
-- 
cgit v1.2.3


From ac01f26a27f10aace4bb89fd2c2c05a60c251832 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 8 Jan 2018 10:59:43 +0200
Subject: btrfs: handle failure of add_pending_csums

add_pending_csums was added as part of the new data=ordered
implementation in e6dcd2dc9c48 ("Btrfs: New data=ordered
implementation"). Even back then it called the btrfs_csum_file_blocks
which can fail but it never bothered handling the failure. In ENOMEM
situation this could lead to the filesystem failing to write the
checksums for a particular extent and not detect this. On read this
could lead to the filesystem erroring out due to crc mismatch. Fix it by
propagating failure from add_pending_csums and handling them.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 29b491328f4e..8f7d41fcfbff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2042,12 +2042,15 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, struct list_head *list)
 {
 	struct btrfs_ordered_sum *sum;
+	int ret;
 
 	list_for_each_entry(sum, list, list) {
 		trans->adding_csums = true;
-		btrfs_csum_file_blocks(trans,
+		ret = btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 		trans->adding_csums = false;
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
@@ -3061,7 +3064,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out;
 	}
 
-	add_pending_csums(trans, inode, &ordered_extent->list);
+	ret = add_pending_csums(trans, inode, &ordered_extent->list);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
 
 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	ret = btrfs_update_inode_fallback(trans, root, inode);
-- 
cgit v1.2.3


From 765f3cebff0023d05d724374db8b63c01e07c499 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 31 Jan 2018 17:14:02 +0200
Subject: btrfs: Handle btrfs_set_extent_delalloc failure in
 relocate_file_extent_cluster

Essentially duplicate the error handling from the above block which
handles the !PageUptodate(page) case and additionally clear
EXTENT_BOUNDARY.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f0c3f00e97cb..cd2298d185dd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3268,8 +3268,22 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			nr++;
 		}
 
-		btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL,
-					  0);
+		ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+						NULL, 0);
+		if (ret) {
+			unlock_page(page);
+			put_page(page);
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+							 PAGE_SIZE);
+			btrfs_delalloc_release_extents(BTRFS_I(inode),
+			                               PAGE_SIZE);
+
+			clear_extent_bits(&BTRFS_I(inode)->io_tree,
+					  page_start, page_end,
+					  EXTENT_LOCKED | EXTENT_BOUNDARY);
+			goto out;
+
+		}
 		set_page_dirty(page);
 
 		unlock_extent(&BTRFS_I(inode)->io_tree,
-- 
cgit v1.2.3


From 92e222df7b8f05c565009c7383321b593eca488b Mon Sep 17 00:00:00 2001
From: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Date: Mon, 5 Feb 2018 17:45:11 +0100
Subject: btrfs: alloc_chunk: fix DUP stripe size handling

In case of using DUP, we search for enough unallocated disk space on a
device to hold two stripes.

The devices_info[ndevs-1].max_avail that holds the amount of unallocated
space found is directly assigned to stripe_size, while it's actually
twice the stripe size.

Later on in the code, an unconditional division of stripe_size by
dev_stripes corrects the value, but in the meantime there's a check to
see if the stripe_size does not exceed max_chunk_size. Since during this
check stripe_size is twice the amount as intended, the check will reduce
the stripe_size to max_chunk_size if the actual correct to be used
stripe_size is more than half the amount of max_chunk_size.

The unconditional division later tries to correct stripe_size, but will
actually make sure we can't allocate more than half the max_chunk_size.

Fix this by moving the division by dev_stripes before the max chunk size
check, so it always contains the right value, instead of putting a duct
tape division in further on to get it fixed again.

Since in all other cases than DUP, dev_stripes is 1, this change only
affects DUP.

Other attempts in the past were made to fix this:
* 37db63a400 "Btrfs: fix max chunk size check in chunk allocator" tried
to fix the same problem, but still resulted in part of the code acting
on a wrongly doubled stripe_size value.
* 86db25785a "Btrfs: fix max chunk size on raid5/6" unintentionally
broke this fix again.

The real problem was already introduced with the rest of the code in
73c5de0051.

The user visible result however will be that the max chunk size for DUP
will suddenly double, while it's actually acting according to the limits
in the code again like it was 5 years ago.

Reported-by: Naohiro Aota <naohiro.aota@wdc.com>
Link: https://www.spinics.net/lists/linux-btrfs/msg69752.html
Fixes: 73c5de0051 ("btrfs: quasi-round-robin for chunk allocation")
Fixes: 86db25785a ("Btrfs: fix max chunk size on raid5/6")
Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ceb924ca0d6..b2d05c6b1c56 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4829,10 +4829,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	ndevs = min(ndevs, devs_max);
 
 	/*
-	 * the primary goal is to maximize the number of stripes, so use as many
-	 * devices as possible, even if the stripes are not maximum sized.
+	 * The primary goal is to maximize the number of stripes, so use as
+	 * many devices as possible, even if the stripes are not maximum sized.
+	 *
+	 * The DUP profile stores more than one stripe per device, the
+	 * max_avail is the total size so we have to adjust.
 	 */
-	stripe_size = devices_info[ndevs-1].max_avail;
+	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
 	num_stripes = ndevs * dev_stripes;
 
 	/*
@@ -4867,8 +4870,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
-	stripe_size = div_u64(stripe_size, dev_stripes);
-
 	/* align to BTRFS_STRIPE_LEN */
 	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
-- 
cgit v1.2.3


From 3c181c12c431fe33b669410d663beb9cceefcd1b Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 22 Feb 2018 21:58:42 +0800
Subject: btrfs: use proper endianness accessors for super_copy

The fs_info::super_copy is a byte copy of the on-disk structure and all
members must use the accessor macros/functions to obtain the right
value.  This was missing in update_super_roots and in sysfs readers.

Moving between opposite endianness hosts will report bogus numbers in
sysfs, and mount may fail as the root will not be restored correctly. If
the filesystem is always used on a same endian host, this will not be a
problem.

Fix this by using the btrfs_set_super...() functions to set
fs_info::super_copy values, and for the sysfs, use the cached
fs_info::nodesize/sectorsize values.

CC: stable@vger.kernel.org
Fixes: df93589a17378 ("btrfs: export more from FS_INFO to sysfs")
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c       |  8 +++-----
 fs/btrfs/transaction.c | 20 ++++++++++++--------
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a8bafed931f4..d11c70bff5a9 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -423,7 +423,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
 {
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->nodesize);
 }
 
 BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -433,8 +433,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
 {
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-	return snprintf(buf, PAGE_SIZE, "%u\n",
-			fs_info->super_copy->sectorsize);
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize);
 }
 
 BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -444,8 +443,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
 {
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-	return snprintf(buf, PAGE_SIZE, "%u\n",
-			fs_info->super_copy->sectorsize);
+	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize);
 }
 
 BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04f07144b45c..9220f004001c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1722,19 +1722,23 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
 
 	super = fs_info->super_copy;
 
+	/* update latest btrfs_super_block::chunk_root refs */
 	root_item = &fs_info->chunk_root->root_item;
-	super->chunk_root = root_item->bytenr;
-	super->chunk_root_generation = root_item->generation;
-	super->chunk_root_level = root_item->level;
+	btrfs_set_super_chunk_root(super, root_item->bytenr);
+	btrfs_set_super_chunk_root_generation(super, root_item->generation);
+	btrfs_set_super_chunk_root_level(super, root_item->level);
 
+	/* update latest btrfs_super_block::root refs */
 	root_item = &fs_info->tree_root->root_item;
-	super->root = root_item->bytenr;
-	super->generation = root_item->generation;
-	super->root_level = root_item->level;
+	btrfs_set_super_root(super, root_item->bytenr);
+	btrfs_set_super_generation(super, root_item->generation);
+	btrfs_set_super_root_level(super, root_item->level);
+
 	if (btrfs_test_opt(fs_info, SPACE_CACHE))
-		super->cache_generation = root_item->generation;
+		btrfs_set_super_cache_generation(super, root_item->generation);
 	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
-		super->uuid_tree_generation = root_item->generation;
+		btrfs_set_super_uuid_tree_generation(super,
+						     root_item->generation);
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
-- 
cgit v1.2.3


From d4dfc0f4d39475ccbbac947880b5464a74c30b99 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 6 Feb 2018 20:39:20 +0000
Subject: Btrfs: send, fix issuing write op when processing hole in no data
 mode

When doing an incremental send of a filesystem with the no-holes feature
enabled, we end up issuing a write operation when using the no data mode
send flag, instead of issuing an update extent operation. Fix this by
issuing the update extent operation instead.

Trivial reproducer:

  $ mkfs.btrfs -f -O no-holes /dev/sdc
  $ mkfs.btrfs -f /dev/sdd
  $ mount /dev/sdc /mnt/sdc
  $ mount /dev/sdd /mnt/sdd

  $ xfs_io -f -c "pwrite -S 0xab 0 32K" /mnt/sdc/foobar
  $ btrfs subvolume snapshot -r /mnt/sdc /mnt/sdc/snap1

  $ xfs_io -c "fpunch 8K 8K" /mnt/sdc/foobar
  $ btrfs subvolume snapshot -r /mnt/sdc /mnt/sdc/snap2

  $ btrfs send /mnt/sdc/snap1 | btrfs receive /mnt/sdd
  $ btrfs send --no-data -p /mnt/sdc/snap1 /mnt/sdc/snap2 \
       | btrfs receive -vv /mnt/sdd

Before this change the output of the second receive command is:

  receiving snapshot snap2 uuid=f6922049-8c22-e544-9ff9-fc6755918447...
  utimes
  write foobar, offset 8192, len 8192
  utimes foobar
  BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=f6922049-8c22-e544-9ff9-...

After this change it is:

  receiving snapshot snap2 uuid=564d36a3-ebc8-7343-aec9-bf6fda278e64...
  utimes
  update_extent foobar: offset=8192, len=8192
  utimes foobar
  BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=564d36a3-ebc8-7343-aec9-bf6fda278e64...

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f306c608dc28..484e2af793de 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5005,6 +5005,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
 	u64 len;
 	int ret = 0;
 
+	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+		return send_update_extent(sctx, offset, end - offset);
+
 	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 9a6509c4daa91400b52a5fd541a5521c649a8fea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 28 Feb 2018 15:55:40 +0000
Subject: Btrfs: fix log replay failure after linking special file and fsync

If in the same transaction we rename a special file (fifo, character/block
device or symbolic link), create a hard link for it having its old name
then sync the log, we will end up with a log that can not be replayed and
at when attempting to replay it, an EEXIST error is returned and mounting
the filesystem fails. Example scenario:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt
  $ mkdir /mnt/testdir
  $ mkfifo /mnt/testdir/foo
  # Make sure everything done so far is durably persisted.
  $ sync

  # Create some unrelated file and fsync it, this is just to create a log
  # tree. The file must be in the same directory as our special file.
  $ touch /mnt/testdir/f1
  $ xfs_io -c "fsync" /mnt/testdir/f1

  # Rename our special file and then create a hard link with its old name.
  $ mv /mnt/testdir/foo /mnt/testdir/bar
  $ ln /mnt/testdir/bar /mnt/testdir/foo

  # Create some other unrelated file and fsync it, this is just to persist
  # the log tree which was modified by the previous rename and link
  # operations. Alternatively we could have modified file f1 and fsync it.
  $ touch /mnt/f2
  $ xfs_io -c "fsync" /mnt/f2

  <power failure>

  $ mount /dev/sdc /mnt
  mount: mount /dev/sdc on /mnt failed: File exists

This happens because when both the log tree and the subvolume's tree have
an entry in the directory "testdir" with the same name, that is, there
is one key (258 INODE_REF 257) in the subvolume tree and another one in
the log tree (where 258 is the inode number of our special file and 257
is the inode for directory "testdir"). Only the data of those two keys
differs, in the subvolume tree the index field for inode reference has
a value of 3 while the log tree it has a value of 5. Because the same key
exists in both trees, but have different index, the log replay fails with
an -EEXIST error when attempting to replay the inode reference from the
log tree.

Fix this by setting the last_unlink_trans field of the inode (our special
file) to the current transaction id when a hard link is created, as this
forces logging the parent directory inode, solving the conflict at log
replay time.

A new generic test case for fstests was also submitted.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 61f20c367aaf..f7a18751314a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5851,7 +5851,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 	 * this will force the logging code to walk the dentry chain
 	 * up for the file
 	 */
-	if (S_ISREG(inode->vfs_inode.i_mode))
+	if (!S_ISDIR(inode->vfs_inode.i_mode))
 		inode->last_unlink_trans = trans->transid;
 
 	/*
-- 
cgit v1.2.3


From 1f250e929a9c9332fd6ea34da684afee73837cfe Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 28 Feb 2018 15:56:10 +0000
Subject: Btrfs: fix log replay failure after unlink and link combination

If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir /mnt/testdir
  $ touch /mnt/testdir/foo
  $ ln /mnt/testdir/foo /mnt/testdir/bar

  $ sync

  $ unlink /mnt/testdir/bar
  $ touch /mnt/testdir/bar
  $ xfs_io -c "fsync" /mnt/testdir/bar

  <power failure>

  $ mount /dev/sdb /mnt
  mount: mount(2) failed: /mnt: No such file or directory

When replaying the log, for that example, we also see the following in
dmesg/syslog:

  [71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, inode 258 parent 257
  [71813.674204] ------------[ cut here ]------------
  [71813.675694] BTRFS: Transaction aborted (error -2)
  [71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 __btrfs_unlink_inode+0x17b/0x355 [btrfs]
  [71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
  [71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: G        W        4.15.0-rc9-btrfs-next-56+ #1
  [71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
  [71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
  [71813.679669] RSP: 0018:ffffc90001cef738 EFLAGS: 00010286
  [71813.679669] RAX: 0000000000000025 RBX: ffff880217ce4708 RCX: 0000000000000001
  [71813.679669] RDX: 0000000000000000 RSI: ffffffff81c14bae RDI: 00000000ffffffff
  [71813.679669] RBP: ffffc90001cef7c0 R08: 0000000000000001 R09: 0000000000000001
  [71813.679669] R10: ffffc90001cef5e0 R11: ffffffff8343f007 R12: ffff880217d474c8
  [71813.679669] R13: 00000000fffffffe R14: ffff88021ccf1548 R15: 0000000000000101
  [71813.679669] FS:  00007f7cee84c480(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
  [71813.679669] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [71813.679669] CR2: 00007f7cedc1abf9 CR3: 00000002354b4003 CR4: 00000000001606e0
  [71813.679669] Call Trace:
  [71813.679669]  btrfs_unlink_inode+0x17/0x41 [btrfs]
  [71813.679669]  drop_one_dir_item+0xfa/0x131 [btrfs]
  [71813.679669]  add_inode_ref+0x71e/0x851 [btrfs]
  [71813.679669]  ? __lock_is_held+0x39/0x71
  [71813.679669]  ? replay_one_buffer+0x53/0x53a [btrfs]
  [71813.679669]  replay_one_buffer+0x4a4/0x53a [btrfs]
  [71813.679669]  ? rcu_read_unlock+0x3a/0x57
  [71813.679669]  ? __lock_is_held+0x39/0x71
  [71813.679669]  walk_up_log_tree+0x101/0x1d2 [btrfs]
  [71813.679669]  walk_log_tree+0xad/0x188 [btrfs]
  [71813.679669]  btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
  [71813.679669]  ? replay_one_extent+0x544/0x544 [btrfs]
  [71813.679669]  open_ctree+0x1cf6/0x2209 [btrfs]
  [71813.679669]  btrfs_mount_root+0x368/0x482 [btrfs]
  [71813.679669]  ? trace_hardirqs_on_caller+0x14c/0x1a6
  [71813.679669]  ? __lockdep_init_map+0x176/0x1c2
  [71813.679669]  ? mount_fs+0x64/0x10b
  [71813.679669]  mount_fs+0x64/0x10b
  [71813.679669]  vfs_kern_mount+0x68/0xce
  [71813.679669]  btrfs_mount+0x13e/0x772 [btrfs]
  [71813.679669]  ? trace_hardirqs_on_caller+0x14c/0x1a6
  [71813.679669]  ? __lockdep_init_map+0x176/0x1c2
  [71813.679669]  ? mount_fs+0x64/0x10b
  [71813.679669]  mount_fs+0x64/0x10b
  [71813.679669]  vfs_kern_mount+0x68/0xce
  [71813.679669]  do_mount+0x6e5/0x973
  [71813.679669]  ? memdup_user+0x3e/0x5c
  [71813.679669]  SyS_mount+0x72/0x98
  [71813.679669]  entry_SYSCALL_64_fastpath+0x1e/0x8b
  [71813.679669] RIP: 0033:0x7f7cedf150ba
  [71813.679669] RSP: 002b:00007ffca71da688 EFLAGS: 00000206
  [71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
  [71813.679669] ---[ end trace 83bd473fc5b4663b ]---
  [71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: errno=-2 No such entry
  [71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 No such entry (Failed to recover log tree)
  [71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned -30
  [71814.128078] BTRFS error (device dm-0): open_ctree failed

This happens because the log has inode reference items for both inode 258
(the first file we created) and inode 259 (the second file created), and
when processing the reference item for inode 258, we replace the
corresponding item in the subvolume tree (which has two names, "foo" and
"bar") witht he one in the log (which only has one name, "foo") without
removing the corresponding dir index keys from the parent directory.
Later, when processing the inode reference item for inode 259, which has
a name of "bar" associated to it, we notice that dir index entries exist
for that name and for a different inode, so we attempt to unlink that
name, which fails because the inode reference item for inode 258 no longer
has the name "bar" associated to it, making a call to btrfs_unlink_inode()
fail with a -ENOENT error.

Fix this by unlinking all the names in an inode reference item from a
subvolume tree that are not present in the inode reference item found in
the log tree, before overwriting it with the item from the log tree.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h      |   5 ++-
 fs/btrfs/inode-item.c |  44 ++++++++++++--------
 fs/btrfs/tree-log.c   | 112 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 139 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f521ba5f2f9..da308774b8a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3095,7 +3095,10 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
 			  u64 inode_objectid, u64 ref_objectid, int ins_len,
 			  int cow);
 
-int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
+			       const char *name,
+			       int name_len, struct btrfs_inode_ref **ref_ret);
+int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
 				   u64 ref_objectid, const char *name,
 				   int name_len,
 				   struct btrfs_inode_extref **extref_ret);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 39c968f80157..65e1a76bf755 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -22,10 +22,10 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-static int find_name_in_backref(struct btrfs_path *path, const char *name,
-			 int name_len, struct btrfs_inode_ref **ref_ret)
+int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
+			       const char *name,
+			       int name_len, struct btrfs_inode_ref **ref_ret)
 {
-	struct extent_buffer *leaf;
 	struct btrfs_inode_ref *ref;
 	unsigned long ptr;
 	unsigned long name_ptr;
@@ -33,9 +33,8 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	u32 cur_offset = 0;
 	int len;
 
-	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	item_size = btrfs_item_size_nr(leaf, slot);
+	ptr = btrfs_item_ptr_offset(leaf, slot);
 	while (cur_offset < item_size) {
 		ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
 		len = btrfs_inode_ref_name_len(leaf, ref);
@@ -44,18 +43,19 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 		if (len != name_len)
 			continue;
 		if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
-			*ref_ret = ref;
+			if (ref_ret)
+				*ref_ret = ref;
 			return 1;
 		}
 	}
 	return 0;
 }
 
-int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
+				   u64 ref_objectid,
 				   const char *name, int name_len,
 				   struct btrfs_inode_extref **extref_ret)
 {
-	struct extent_buffer *leaf;
 	struct btrfs_inode_extref *extref;
 	unsigned long ptr;
 	unsigned long name_ptr;
@@ -63,9 +63,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
 	u32 cur_offset = 0;
 	int ref_name_len;
 
-	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	item_size = btrfs_item_size_nr(leaf, slot);
+	ptr = btrfs_item_ptr_offset(leaf, slot);
 
 	/*
 	 * Search all extended backrefs in this item. We're only
@@ -113,7 +112,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	if (ret > 0)
 		return NULL;
-	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+	if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+					    ref_objectid, name, name_len,
+					    &extref))
 		return NULL;
 	return extref;
 }
@@ -155,7 +156,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	 * This should always succeed so error here will make the FS
 	 * readonly.
 	 */
-	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+	if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+					    ref_objectid,
 					    name, name_len, &extref)) {
 		btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
 		ret = -EROFS;
@@ -225,7 +227,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	} else if (ret < 0) {
 		goto out;
 	}
-	if (!find_name_in_backref(path, name, name_len, &ref)) {
+	if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+					name, name_len, &ref)) {
 		ret = -ENOENT;
 		search_ext_refs = 1;
 		goto out;
@@ -293,7 +296,9 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      ins_len);
 	if (ret == -EEXIST) {
-		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+		if (btrfs_find_name_in_ext_backref(path->nodes[0],
+						   path->slots[0],
+						   ref_objectid,
 						   name, name_len, NULL))
 			goto out;
 
@@ -351,7 +356,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	if (ret == -EEXIST) {
 		u32 old_size;
 
-		if (find_name_in_backref(path, name, name_len, &ref))
+		if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+					       name, name_len, &ref))
 			goto out;
 
 		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
@@ -365,7 +371,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ret = 0;
 	} else if (ret < 0) {
 		if (ret == -EOVERFLOW) {
-			if (find_name_in_backref(path, name, name_len, &ref))
+			if (btrfs_find_name_in_backref(path->nodes[0],
+						       path->slots[0],
+						       name, name_len, &ref))
 				ret = -EEXIST;
 			else
 				ret = -EMLINK;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f7a18751314a..4c50f823949c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -966,7 +966,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 
 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
-		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+		if (btrfs_find_name_in_ext_backref(path->nodes[0],
+						   path->slots[0],
+						   ref_objectid,
 						   name, namelen, NULL))
 			match = 1;
 
@@ -1190,7 +1192,8 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
 			   *namelen);
 
-	*index = btrfs_inode_extref_index(eb, extref);
+	if (index)
+		*index = btrfs_inode_extref_index(eb, extref);
 	if (parent_objectid)
 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
 
@@ -1211,11 +1214,101 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
 
 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
 
-	*index = btrfs_inode_ref_index(eb, ref);
+	if (index)
+		*index = btrfs_inode_ref_index(eb, ref);
 
 	return 0;
 }
 
+/*
+ * Take an inode reference item from the log tree and iterate all names from the
+ * inode reference item in the subvolume tree with the same key (if it exists).
+ * For any name that is not in the inode reference item from the log tree, do a
+ * proper unlink of that name (that is, remove its entry from the inode
+ * reference item and both dir index keys).
+ */
+static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_inode *inode,
+				 struct extent_buffer *log_eb,
+				 int log_slot,
+				 struct btrfs_key *key)
+{
+	int ret;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+	struct extent_buffer *eb;
+
+again:
+	btrfs_release_path(path);
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[0];
+	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+	while (ref_ptr < ref_end) {
+		char *name = NULL;
+		int namelen;
+		u64 parent_id;
+
+		if (key->type == BTRFS_INODE_EXTREF_KEY) {
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						NULL, &parent_id);
+		} else {
+			parent_id = key->offset;
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     NULL);
+		}
+		if (ret)
+			goto out;
+
+		if (key->type == BTRFS_INODE_EXTREF_KEY)
+			ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
+							     parent_id, name,
+							     namelen, NULL);
+		else
+			ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
+							 namelen, NULL);
+
+		if (!ret) {
+			struct inode *dir;
+
+			btrfs_release_path(path);
+			dir = read_one_inode(root, parent_id);
+			if (!dir) {
+				ret = -ENOENT;
+				kfree(name);
+				goto out;
+			}
+			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+						 inode, name, namelen);
+			kfree(name);
+			iput(dir);
+			if (ret)
+				goto out;
+			goto again;
+		}
+
+		kfree(name);
+		ref_ptr += namelen;
+		if (key->type == BTRFS_INODE_EXTREF_KEY)
+			ref_ptr += sizeof(struct btrfs_inode_extref);
+		else
+			ref_ptr += sizeof(struct btrfs_inode_ref);
+	}
+	ret = 0;
+ out:
+	btrfs_release_path(path);
+	return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1344,6 +1437,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	/*
+	 * Before we overwrite the inode reference item in the subvolume tree
+	 * with the item from the log tree, we must unlink all names from the
+	 * parent directory that are in the subvolume's tree inode reference
+	 * item, otherwise we end up with an inconsistent subvolume tree where
+	 * dir index entries exist for a name but there is no inode reference
+	 * item with the same name.
+	 */
+	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
+				    key);
+	if (ret)
+		goto out;
+
 	/* finally write the back reference in the inode */
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 out:
-- 
cgit v1.2.3


From 1c789249578895bb14ab62b4327306439b754857 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@icloud.com>
Date: Thu, 1 Mar 2018 14:24:51 +0800
Subject: ceph: fix potential memory leak in init_caches()

There is lack of cache destroy operation for ceph_file_cachep
when failing from fscache register.

Signed-off-by: Chengguang Xu <cgxu519@icloud.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1c470b453a9e..fb2bc9c15a23 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -713,14 +713,17 @@ static int __init init_caches(void)
 		goto bad_dentry;
 
 	ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
-
 	if (!ceph_file_cachep)
 		goto bad_file;
 
-	if ((error = ceph_fscache_register()))
-		goto bad_file;
+	error = ceph_fscache_register();
+	if (error)
+		goto bad_fscache;
 
 	return 0;
+
+bad_fscache:
+	kmem_cache_destroy(ceph_file_cachep);
 bad_file:
 	kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
-- 
cgit v1.2.3