152 files changed, 4744 insertions, 3582 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bc7ad775842..3ef62bad8f2b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
 	default y
+	select PERCPU_RWSEM
 	help
 	  This option enables standard file locking support, required
           for filesystems like NFS and for the flock() system
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c7efddf6e038..4c09d93d9569 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -89,7 +89,7 @@ config BINFMT_SCRIPT
 
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
-	depends on !MMU || M68K
+	depends on !MMU || ARM || M68K
 	depends on !FRV || BROKEN
 	help
 	  Support uClinux FLAT format binaries.
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 7ef637d7f3a5..1e9d2f84e5b5 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work)
  */
 int __init afs_callback_update_init(void)
 {
-	afs_callback_update_worker =
-		create_singlethread_workqueue("kafs_callbackd");
+	afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd",
+							     WQ_MEM_RECLAIM);
 	return afs_callback_update_worker ? 0 : -ENOMEM;
 }
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 85737e96ab8b..2037e7a77a37 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -17,19 +17,12 @@
 #include "internal.h"
 #include "afs_cm.h"
 
-#if 0
-struct workqueue_struct *afs_cm_workqueue;
-#endif  /*  0  */
-
-static int afs_deliver_cb_init_call_back_state(struct afs_call *,
-					       struct sk_buff *, bool);
-static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
-						struct sk_buff *, bool);
-static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *,
-						 struct sk_buff *, bool);
+static int afs_deliver_cb_init_call_back_state(struct afs_call *);
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
+static int afs_deliver_cb_probe(struct afs_call *);
+static int afs_deliver_cb_callback(struct afs_call *);
+static int afs_deliver_cb_probe_uuid(struct afs_call *);
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *);
 static void afs_cm_destructor(struct afs_call *);
 
 /*
@@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call)
 	 * received.  The step number here must match the final number in
 	 * afs_deliver_cb_callback().
 	 */
-	if (call->unmarshall == 6) {
+	if (call->unmarshall == 5) {
 		ASSERT(call->server && call->count && call->request);
 		afs_break_callbacks(call->server, call->count, call->request);
 	}
@@ -168,27 +161,27 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 /*
  * deliver request data to a CB.CallBack call
  */
-static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
-				   bool last)
+static int afs_deliver_cb_callback(struct afs_call *call)
 {
+	struct sockaddr_rxrpc srx;
 	struct afs_callback *cb;
 	struct afs_server *server;
-	struct in_addr addr;
 	__be32 *bp;
 	u32 tmp;
 	int ret, loop;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
+	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
 	switch (call->unmarshall) {
 	case 0:
 		call->offset = 0;
 		call->unmarshall++;
 
 		/* extract the FID array and its count in two steps */
 	case 1:
 		_debug("extract FID count");
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -205,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 
 	case 2:
 		_debug("extract FID array");
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       call->count * 3 * 4);
+		ret = afs_extract_data(call, call->buffer,
+				       call->count * 3 * 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -232,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		/* extract the callback array and its count in two steps */
 	case 3:
 		_debug("extract CB count");
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -242,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 			return -EBADMSG;
 		call->offset = 0;
 		call->unmarshall++;
-		if (tmp == 0)
-			goto empty_cb_array;
 
 	case 4:
 		_debug("extract CB array");
-		ret = afs_extract_data(call, skb, last, call->request,
-				       call->count * 3 * 4);
+		ret = afs_extract_data(call, call->buffer,
+				       call->count * 3 * 4, false);
 		if (ret < 0)
 			return ret;
 
@@ -261,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 			cb->type	= ntohl(*bp++);
 		}
 
-	empty_cb_array:
 		call->offset = 0;
 		call->unmarshall++;
 
-	case 5:
-		ret = afs_data_complete(call, skb, last);
-		if (ret < 0)
-			return ret;
-
 		/* Record that the message was unmarshalled successfully so
 		 * that the call destructor can know do the callback breaking
 		 * work, even if the final ACK isn't received.
@@ -278,17 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		 * updated also.
 		 */
 		call->unmarshall++;
-	case 6:
+	case 5:
 		break;
 	}
 
-
 	call->state = AFS_CALL_REPLYING;
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
-	server = afs_find_server(&addr);
+	server = afs_find_server(&srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -315,17 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 /*
  * deliver request data to a CB.InitCallBackState call
  */
-static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
-					       struct sk_buff *skb,
-					       bool last)
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
+	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
-	struct in_addr addr;
 	int ret;
 
-	_enter(",{%u},%d", skb->len, last);
+	_enter("");
+
+	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
 
-	ret = afs_data_complete(call, skb, last);
+	ret = afs_extract_data(call, NULL, 0, false);
 	if (ret < 0)
 		return ret;
 
@@ -334,8 +317,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
-	server = afs_find_server(&addr);
+	server = afs_find_server(&srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -348,27 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 /*
  * deliver request data to a CB.InitCallBackState3 call
  */
-static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
-						struct sk_buff *skb,
-						bool last)
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
+	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
-	struct in_addr addr;
+	struct afs_uuid *r;
+	unsigned loop;
+	__be32 *b;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	/* Look up the peer on every pass: srx is a stack variable and is not
+	 * preserved if delivery is suspended and later resumed. */
+	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
 
-	_enter(",{%u},%d", skb->len, last);
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL);
+		if (!call->buffer)
+			return -ENOMEM;
+		call->unmarshall++;
 
-	/* There are some arguments that we ignore */
-	afs_data_consumed(call, skb);
-	if (!last)
-		return -EAGAIN;
+	case 1:
+		_debug("extract UUID");
+		ret = afs_extract_data(call, call->buffer,
+				       11 * sizeof(__be32), false);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		_debug("unmarshall UUID");
+		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		if (!call->request)
+			return -ENOMEM;
+
+		b = call->buffer;
+		r = call->request;
+		r->time_low			= ntohl(b[0]);
+		r->time_mid			= ntohl(b[1]);
+		r->time_hi_and_version		= ntohl(b[2]);
+		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
+		r->clock_seq_low		= ntohl(b[4]);
+
+		for (loop = 0; loop < 6; loop++)
+			r->node[loop] = ntohl(b[loop + 5]);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 2:
+		break;
+	}
 
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
-	server = afs_find_server(&addr);
+	server = afs_find_server(&srx);
 	if (!server)
 		return -ENOTCONN;
 	call->server = server;
@@ -393,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
 /*
  * deliver request data to a CB.Probe call
  */
-static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
-				bool last)
+static int afs_deliver_cb_probe(struct afs_call *call)
 {
 	int ret;
 
-	_enter(",{%u},%d", skb->len, last);
+	_enter("");
 
-	ret = afs_data_complete(call, skb, last);
+	ret = afs_extract_data(call, NULL, 0, false);
 	if (ret < 0)
 		return ret;
 
@@ -426,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 
 	_enter("");
 
-
 	if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0)
 		reply.match = htonl(0);
 	else
@@ -439,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 /*
  * deliver request data to a CB.ProbeUuid call
  */
-static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
-				     bool last)
+static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 {
 	struct afs_uuid *r;
 	unsigned loop;
 	__be32 *b;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-
-	ret = afs_data_complete(call, skb, last);
-	if (ret < 0)
-		return ret;
+	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -463,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 
 	case 1:
 		_debug("extract UUID");
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       11 * sizeof(__be32));
+		ret = afs_extract_data(call, call->buffer,
+				       11 * sizeof(__be32), false);
 		switch (ret) {
 		case 0:		break;
 		case -EAGAIN:	return 0;
@@ -491,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 		call->unmarshall++;
 
 	case 2:
-		_debug("trailer");
-		if (skb->len != 0)
-			return -EBADMSG;
 		break;
 	}
 
-	ret = afs_data_complete(call, skb, last);
-	if (ret < 0)
-		return ret;
-
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
@@ -574,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 /*
  * deliver request data to a CB.TellMeAboutYourself call
  */
-static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
-						 struct sk_buff *skb, bool last)
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
 {
 	int ret;
 
-	_enter(",{%u},%d", skb->len, last);
+	_enter("");
 
-	ret = afs_data_complete(call, skb, last);
+	ret = afs_extract_data(call, NULL, 0, false);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index d91a9c9cfbd0..3191dff2c156 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -36,8 +36,8 @@ static int afs_init_lock_manager(void)
 	if (!afs_lock_manager) {
 		mutex_lock(&afs_lock_manager_mutex);
 		if (!afs_lock_manager) {
-			afs_lock_manager =
-				create_singlethread_workqueue("kafs_lockd");
+			afs_lock_manager = alloc_workqueue("kafs_lockd",
+							   WQ_MEM_RECLAIM, 0);
 			if (!afs_lock_manager)
 				ret = -ENOMEM;
 		}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 9312b92e54be..96f4d764d1a6 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
 /*
  * deliver reply data to an FS.FetchStatus
  */
-static int afs_deliver_fs_fetch_status(struct afs_call *call,
-				       struct sk_buff *skb, bool last)
+static int afs_deliver_fs_fetch_status(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter(",,%u", last);
+	_enter("");
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 /*
  * deliver reply data to an FS.FetchData
  */
-static int afs_deliver_fs_fetch_data(struct afs_call *call,
-				     struct sk_buff *skb, bool last)
+static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
@@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	void *buffer;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		 * client) */
 	case 1:
 		_debug("extract data length (MSW)");
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		/* extract the returned data length */
 	case 2:
 		_debug("extract data length");
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		_debug("extract data");
 		if (call->count > 0) {
 			page = call->reply3;
-			buffer = kmap_atomic(page);
-			ret = afs_extract_data(call, skb, last, buffer,
-					       call->count);
-			kunmap_atomic(buffer);
+			buffer = kmap(page);
+			ret = afs_extract_data(call, buffer,
+					       call->count, true);
+			kunmap(page);	/* takes the page, not the vaddr */
 			if (ret < 0)
 				return ret;
 		}
@@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 
 		/* extract the metadata */
 	case 4:
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       (21 + 3 + 6) * 4);
+		ret = afs_extract_data(call, call->buffer,
+				       (21 + 3 + 6) * 4, false);
 		if (ret < 0)
 			return ret;
 
@@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		call->unmarshall++;
 
 	case 5:
-		ret = afs_data_complete(call, skb, last);
-		if (ret < 0)
-			return ret;
 		break;
 	}
 
 	if (call->count < PAGE_SIZE) {
 		_debug("clear");
 		page = call->reply3;
-		buffer = kmap_atomic(page);
+		buffer = kmap(page);
 		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
-		kunmap_atomic(buffer);
+		kunmap(page);	/* takes the page, not the vaddr */
 	}
 
 	_leave(" = 0 [done]");
@@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server,
 /*
  * deliver reply data to an FS.GiveUpCallBacks
  */
-static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
-					    struct sk_buff *skb, bool last)
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call)
 {
-	_enter(",{%u},%d", skb->len, last);
+	_enter("");
 
 	/* shouldn't be any reply data */
-	return afs_data_complete(call, skb, last);
+	return afs_extract_data(call, NULL, 0, false);
 }
 
 /*
@@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
 /*
  * deliver reply data to an FS.CreateFile or an FS.MakeDir
  */
-static int afs_deliver_fs_create_vnode(struct afs_call *call,
-				       struct sk_buff *skb, bool last)
+static int afs_deliver_fs_create_vnode(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server,
 /*
  * deliver reply data to an FS.RemoveFile or FS.RemoveDir
  */
-static int afs_deliver_fs_remove(struct afs_call *call,
-				 struct sk_buff *skb, bool last)
+static int afs_deliver_fs_remove(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server,
 /*
  * deliver reply data to an FS.Link
  */
-static int afs_deliver_fs_link(struct afs_call *call,
-			       struct sk_buff *skb, bool last)
+static int afs_deliver_fs_link(struct afs_call *call)
 {
 	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server,
 /*
  * deliver reply data to an FS.Symlink
  */
-static int afs_deliver_fs_symlink(struct afs_call *call,
-				  struct sk_buff *skb, bool last)
+static int afs_deliver_fs_symlink(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server,
 /*
  * deliver reply data to an FS.Rename
  */
-static int afs_deliver_fs_rename(struct afs_call *call,
-				  struct sk_buff *skb, bool last)
+static int afs_deliver_fs_rename(struct afs_call *call)
 {
 	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server,
 /*
  * deliver reply data to an FS.StoreData
  */
-static int afs_deliver_fs_store_data(struct afs_call *call,
-				     struct sk_buff *skb, bool last)
+static int afs_deliver_fs_store_data(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter(",,%u", last);
+	_enter("");
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 /*
  * deliver reply data to an FS.StoreStatus
  */
-static int afs_deliver_fs_store_status(struct afs_call *call,
-				       struct sk_buff *skb, bool last)
+static int afs_deliver_fs_store_status(struct afs_call *call)
 {
 	afs_dataversion_t *store_version;
 	struct afs_vnode *vnode = call->reply;
 	const __be32 *bp;
 	int ret;
 
-	_enter(",,%u", last);
+	_enter("");
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
@@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 /*
  * deliver reply data to an FS.GetVolumeStatus
  */
-static int afs_deliver_fs_get_volume_status(struct afs_call *call,
-					    struct sk_buff *skb, bool last)
+static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 {
 	const __be32 *bp;
 	char *p;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		/* extract the returned status record */
 	case 1:
 		_debug("extract status");
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       12 * 4);
+		ret = afs_extract_data(call, call->buffer,
+				       12 * 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 
 		/* extract the volume name length */
 	case 2:
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 3:
 		_debug("extract volname");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, skb, last, call->reply3,
-					       call->count);
+			ret = afs_extract_data(call, call->reply3,
+					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
@@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		call->count = 4 - (call->count & 3);
 
 	case 4:
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       call->count);
+		ret = afs_extract_data(call, call->buffer,
+				       call->count, true);
 		if (ret < 0)
 			return ret;
 
@@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 
 		/* extract the offline message length */
 	case 5:
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 6:
 		_debug("extract offline");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, skb, last, call->reply3,
-					       call->count);
+			ret = afs_extract_data(call, call->reply3,
+					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
@@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		call->count = 4 - (call->count & 3);
 
 	case 7:
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       call->count);
+		ret = afs_extract_data(call, call->buffer,
+				       call->count, true);
 		if (ret < 0)
 			return ret;
 
@@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 
 		/* extract the message of the day length */
 	case 8:
-		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		ret = afs_extract_data(call, &call->tmp, 4, true);
 		if (ret < 0)
 			return ret;
 
@@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 	case 9:
 		_debug("extract motd");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, skb, last, call->reply3,
-					       call->count);
+			ret = afs_extract_data(call, call->reply3,
+					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
@@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
 		call->unmarshall++;
 
 		/* extract the message of the day padding */
-		if ((call->count & 3) == 0) {
-			call->unmarshall++;
-			goto no_motd_padding;
-		}
-		call->count = 4 - (call->count & 3);
+		call->count = (4 - (call->count & 3)) & 3;
 
 	case 10:
-		ret = afs_extract_data(call, skb, last, call->buffer,
-				       call->count);
+		ret = afs_extract_data(call, call->buffer,
+				       call->count, false);
 		if (ret < 0)
 			return ret;
 
 		call->offset = 0;
 		call->unmarshall++;
-	no_motd_padding:
-
 	case 11:
-		ret = afs_data_complete(call, skb, last);
-		if (ret < 0)
-			return ret;
 		break;
 	}
 
@@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server,
 /*
  * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
  */
-static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
-				    struct sk_buff *skb, bool last)
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
 {
 	const __be32 *bp;
 	int ret;
 
-	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+	_enter("{%u}", call->unmarshall);
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index df976b2a7f40..5497c8496055 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -13,13 +13,13 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/skbuff.h>
 #include <linux/rxrpc.h>
 #include <linux/key.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/fscache.h>
 #include <linux/backing-dev.h>
+#include <net/af_rxrpc.h>
 
 #include "afs.h"
 #include "afs_vl.h"
@@ -56,7 +56,7 @@ struct afs_mount_params {
  */
 struct afs_wait_mode {
 	/* RxRPC received message notification */
-	void (*rx_wakeup)(struct afs_call *call);
+	rxrpc_notify_rx_t notify_rx;
 
 	/* synchronous call waiter and call dispatched notification */
 	int (*wait)(struct afs_call *call);
@@ -75,10 +75,8 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
-	void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
-	struct sk_buff_head	rx_queue;	/* received packets */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct key		*key;		/* security for this call */
 	struct afs_server	*server;	/* server affected by incoming CM call */
@@ -92,6 +90,7 @@ struct afs_call {
 	void			*reply4;	/* reply buffer (fourth part) */
 	pgoff_t			first;		/* first page in mapping to deal with */
 	pgoff_t			last;		/* last page in mapping to deal with */
+	size_t			offset;		/* offset into received data store */
 	enum {					/* call state */
 		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
 		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
@@ -99,21 +98,18 @@ struct afs_call {
 		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
 		AFS_CALL_REPLYING,	/* replying to incoming call */
 		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
-		AFS_CALL_COMPLETE,	/* successfully completed */
-		AFS_CALL_BUSY,		/* server was busy */
-		AFS_CALL_ABORTED,	/* call was aborted */
-		AFS_CALL_ERROR,		/* call failed due to error */
+		AFS_CALL_COMPLETE,	/* Completed or failed */
 	}			state;
 	int			error;		/* error code */
+	u32			abort_code;	/* Remote abort ID or 0 */
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
-	unsigned		reply_size;	/* current size of reply */
 	unsigned		first_offset;	/* offset into mapping[first] */
 	unsigned		last_to;	/* amount of mapping[last] */
-	unsigned		offset;		/* offset into received data store */
 	unsigned char		unmarshall;	/* unmarshalling phase */
 	bool			incoming;	/* T if incoming call */
 	bool			send_pages;	/* T if data from mapping should be sent */
+	bool			need_attention;	/* T if RxRPC poked us */
 	u16			service_id;	/* RxRPC service ID to call */
 	__be16			port;		/* target UDP port */
 	__be32			operation_ID;	/* operation ID for an incoming call */
@@ -128,8 +124,7 @@ struct afs_call_type {
 	/* deliver request or reply data to an call
 	 * - returning an error will cause the call to be aborted
 	 */
-	int (*deliver)(struct afs_call *call, struct sk_buff *skb,
-		       bool last);
+	int (*deliver)(struct afs_call *call);
 
 	/* map an abort code to an error number */
 	int (*abort_to_error)(u32 abort_code);
@@ -607,29 +602,22 @@ extern void afs_proc_cell_remove(struct afs_cell *);
 /*
  * rxrpc.c
  */
+extern struct socket *afs_socket;
+
 extern int afs_open_socket(void);
 extern void afs_close_socket(void);
-extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
 extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
 			 const struct afs_wait_mode *);
 extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
-extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
 extern void afs_send_empty_reply(struct afs_call *);
 extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
-extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
-			    size_t);
+extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
 
-static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
-				    bool last)
+static inline int afs_transfer_reply(struct afs_call *call)
 {
-	if (skb->len > 0)
-		return -EBADMSG;
-	afs_data_consumed(call, skb);
-	if (!last)
-		return -EAGAIN;
-	return 0;
+	return afs_extract_data(call, call->buffer, call->reply_max, false);
 }
 
 /*
@@ -654,7 +642,7 @@ do {								\
 
 extern struct afs_server *afs_lookup_server(struct afs_cell *,
 					    const struct in_addr *);
-extern struct afs_server *afs_find_server(const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *);
 extern void afs_put_server(struct afs_server *);
 extern void __exit afs_purge_servers(void);
 
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 35de0c04729f..0b187ef3b5b7 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/completion.h>
 #include <linux/sched.h>
+#include <linux/random.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("AFS Client File System");
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 14d04c848465..59bdaa7527b6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -16,34 +16,36 @@
 #include "internal.h"
 #include "afs_cm.h"
 
-static struct socket *afs_socket; /* my RxRPC socket */
+struct socket *afs_socket; /* my RxRPC socket */
 static struct workqueue_struct *afs_async_calls;
+static struct afs_call *afs_spare_incoming_call;
 static atomic_t afs_outstanding_calls;
-static atomic_t afs_outstanding_skbs;
 
-static void afs_wake_up_call_waiter(struct afs_call *);
+static void afs_free_call(struct afs_call *);
+static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
 static int afs_wait_for_call_to_complete(struct afs_call *);
-static void afs_wake_up_async_call(struct afs_call *);
+static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
 static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct afs_call *);
-static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
-static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+static void afs_process_async_call(struct work_struct *);
+static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
+static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static int afs_deliver_cm_op_id(struct afs_call *);
 
 /* synchronous call management */
 const struct afs_wait_mode afs_sync_call = {
-	.rx_wakeup	= afs_wake_up_call_waiter,
+	.notify_rx	= afs_wake_up_call_waiter,
 	.wait		= afs_wait_for_call_to_complete,
 };
 
 /* asynchronous call management */
 const struct afs_wait_mode afs_async_call = {
-	.rx_wakeup	= afs_wake_up_async_call,
+	.notify_rx	= afs_wake_up_async_call,
 	.wait		= afs_dont_wait_for_call_to_complete,
 };
 
 /* asynchronous incoming call management */
 static const struct afs_wait_mode afs_async_incoming_call = {
-	.rx_wakeup	= afs_wake_up_async_call,
+	.notify_rx	= afs_wake_up_async_call,
 };
 
 /* asynchronous incoming call initial processing */
@@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = {
 	.abort_to_error	= afs_abort_to_error,
 };
 
-static void afs_collect_incoming_call(struct work_struct *);
+static void afs_charge_preallocation(struct work_struct *);
 
-static struct sk_buff_head afs_incoming_calls;
-static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
-
-static void afs_async_workfn(struct work_struct *work)
-{
-	struct afs_call *call = container_of(work, struct afs_call, async_work);
-
-	call->async_workfn(call);
-}
+static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
 
 static int afs_wait_atomic_t(atomic_t *p)
 {
@@ -83,10 +77,8 @@ int afs_open_socket(void)
 
 	_enter("");
 
-	skb_queue_head_init(&afs_incoming_calls);
-
 	ret = -ENOMEM;
-	afs_async_calls = create_singlethread_workqueue("kafsd");
+	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
 	if (!afs_async_calls)
 		goto error_0;
 
@@ -110,13 +102,15 @@ int afs_open_socket(void)
 	if (ret < 0)
 		goto error_2;
 
+	rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
+					   afs_rx_discard_new_call);
+
 	ret = kernel_listen(socket, INT_MAX);
 	if (ret < 0)
 		goto error_2;
 
-	rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
-
 	afs_socket = socket;
+	afs_charge_preallocation(NULL);
 	_leave(" = 0");
 	return 0;
 
@@ -136,52 +130,28 @@ void afs_close_socket(void)
 {
 	_enter("");
 
+	if (afs_spare_incoming_call) {
+		atomic_inc(&afs_outstanding_calls);
+		afs_free_call(afs_spare_incoming_call);
+		afs_spare_incoming_call = NULL;
+	}
+
+	_debug("outstanding %u", atomic_read(&afs_outstanding_calls));
 	wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
+	flush_workqueue(afs_async_calls);
+	kernel_sock_shutdown(afs_socket, SHUT_RDWR);
+	flush_workqueue(afs_async_calls);
 	sock_release(afs_socket);
 
 	_debug("dework");
 	destroy_workqueue(afs_async_calls);
-
-	ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
 	_leave("");
 }
 
 /*
- * Note that the data in a socket buffer is now consumed.
- */
-void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
-{
-	if (!skb) {
-		_debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
-		dump_stack();
-	} else {
-		_debug("DLVR %p{%u} [%d]",
-		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-		rxrpc_kernel_data_consumed(call->rxcall, skb);
-	}
-}
-
-/*
- * free a socket buffer
- */
-static void afs_free_skb(struct sk_buff *skb)
-{
-	if (!skb) {
-		_debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
-		dump_stack();
-	} else {
-		_debug("FREE %p{%u} [%d]",
-		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
-			BUG();
-		rxrpc_kernel_free_skb(skb);
-	}
-}
-
-/*
  * free a call
  */
 static void afs_free_call(struct afs_call *call)
@@ -191,7 +161,6 @@ static void afs_free_call(struct afs_call *call)
 
 	ASSERTCMP(call->rxcall, ==, NULL);
 	ASSERT(!work_pending(&call->async_work));
-	ASSERT(skb_queue_empty(&call->rx_queue));
 	ASSERT(call->type->name != NULL);
 
 	kfree(call->request);
@@ -207,7 +176,7 @@ static void afs_free_call(struct afs_call *call)
 static void afs_end_call_nofree(struct afs_call *call)
 {
 	if (call->rxcall) {
-		rxrpc_kernel_end_call(call->rxcall);
+		rxrpc_kernel_end_call(afs_socket, call->rxcall);
 		call->rxcall = NULL;
 	}
 	if (call->type->destructor)
@@ -227,7 +196,7 @@ static void afs_end_call(struct afs_call *call)
  * allocate a call with flat request and reply buffers
  */
 struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
-				     size_t request_size, size_t reply_size)
+				     size_t request_size, size_t reply_max)
 {
 	struct afs_call *call;
 
@@ -241,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
 
 	call->type = type;
 	call->request_size = request_size;
-	call->reply_max = reply_size;
+	call->reply_max = reply_max;
 
 	if (request_size) {
 		call->request = kmalloc(request_size, GFP_NOFS);
@@ -249,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
 			goto nomem_free;
 	}
 
-	if (reply_size) {
-		call->buffer = kmalloc(reply_size, GFP_NOFS);
+	if (reply_max) {
+		call->buffer = kmalloc(reply_max, GFP_NOFS);
 		if (!call->buffer)
 			goto nomem_free;
 	}
 
 	init_waitqueue_head(&call->waitq);
-	skb_queue_head_init(&call->rx_queue);
 	return call;
 
 nomem_free:
@@ -325,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 			 * returns from sending the request */
 			if (first + loop >= last)
 				call->state = AFS_CALL_AWAIT_REPLY;
-			ret = rxrpc_kernel_send_data(call->rxcall, msg,
-						     to - offset);
+			ret = rxrpc_kernel_send_data(afs_socket, call->rxcall,
+						     msg, to - offset);
 			kunmap(pages[loop]);
 			if (ret < 0)
 				break;
@@ -354,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	struct msghdr msg;
 	struct kvec iov[1];
 	int ret;
-	struct sk_buff *skb;
 
 	_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
 
@@ -366,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	       atomic_read(&afs_outstanding_calls));
 
 	call->wait_mode = wait_mode;
-	call->async_workfn = afs_process_async_call;
-	INIT_WORK(&call->async_work, afs_async_workfn);
+	INIT_WORK(&call->async_work, afs_process_async_call);
 
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -380,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	/* create a call */
 	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
-					 (unsigned long) call, gfp);
+					 (unsigned long) call, gfp,
+					 wait_mode->notify_rx);
 	call->key = NULL;
 	if (IS_ERR(rxcall)) {
 		ret = PTR_ERR(rxcall);
@@ -406,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	 * request */
 	if (!call->send_pages)
 		call->state = AFS_CALL_AWAIT_REPLY;
-	ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+	ret = rxrpc_kernel_send_data(afs_socket, rxcall,
+				     &msg, call->request_size);
 	if (ret < 0)
 		goto error_do_abort;
 
@@ -421,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	return wait_mode->wait(call);
 
 error_do_abort:
-	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
-	while ((skb = skb_dequeue(&call->rx_queue)))
-		afs_free_skb(skb);
+	rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD");
 error_kill_call:
 	afs_end_call(call);
 	_leave(" = %d", ret);
@@ -431,140 +397,77 @@ error_kill_call:
 }
 
 /*
- * Handles intercepted messages that were arriving in the socket's Rx queue.
- *
- * Called from the AF_RXRPC call processor in waitqueue process context.  For
- * each call, it is guaranteed this will be called in order of packet to be
- * delivered.
- */
-static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
-			       struct sk_buff *skb)
-{
-	struct afs_call *call = (struct afs_call *) user_call_ID;
-
-	_enter("%p,,%u", call, skb->mark);
-
-	_debug("ICPT %p{%u} [%d]",
-	       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-
-	ASSERTCMP(sk, ==, afs_socket->sk);
-	atomic_inc(&afs_outstanding_skbs);
-
-	if (!call) {
-		/* its an incoming call for our callback service */
-		skb_queue_tail(&afs_incoming_calls, skb);
-		queue_work(afs_wq, &afs_collect_incoming_call_work);
-	} else {
-		/* route the messages directly to the appropriate call */
-		skb_queue_tail(&call->rx_queue, skb);
-		call->wait_mode->rx_wakeup(call);
-	}
-
-	_leave("");
-}
-
-/*
  * deliver messages to a call
  */
 static void afs_deliver_to_call(struct afs_call *call)
 {
-	struct sk_buff *skb;
-	bool last;
 	u32 abort_code;
 	int ret;
 
-	_enter("");
-
-	while ((call->state == AFS_CALL_AWAIT_REPLY ||
-		call->state == AFS_CALL_AWAIT_OP_ID ||
-		call->state == AFS_CALL_AWAIT_REQUEST ||
-		call->state == AFS_CALL_AWAIT_ACK) &&
-	       (skb = skb_dequeue(&call->rx_queue))) {
-		switch (skb->mark) {
-		case RXRPC_SKB_MARK_DATA:
-			_debug("Rcv DATA");
-			last = rxrpc_kernel_is_data_last(skb);
-			ret = call->type->deliver(call, skb, last);
-			switch (ret) {
-			case -EAGAIN:
-				if (last) {
-					_debug("short data");
-					goto unmarshal_error;
-				}
-				break;
-			case 0:
-				ASSERT(last);
-				if (call->state == AFS_CALL_AWAIT_REPLY)
-					call->state = AFS_CALL_COMPLETE;
-				break;
-			case -ENOTCONN:
-				abort_code = RX_CALL_DEAD;
-				goto do_abort;
-			case -ENOTSUPP:
-				abort_code = RX_INVALID_OPERATION;
-				goto do_abort;
-			default:
-			unmarshal_error:
-				abort_code = RXGEN_CC_UNMARSHAL;
-				if (call->state != AFS_CALL_AWAIT_REPLY)
-					abort_code = RXGEN_SS_UNMARSHAL;
-			do_abort:
-				rxrpc_kernel_abort_call(call->rxcall,
-							abort_code);
-				call->error = ret;
-				call->state = AFS_CALL_ERROR;
-				break;
+	_enter("%s", call->type->name);
+
+	while (call->state == AFS_CALL_AWAIT_REPLY ||
+	       call->state == AFS_CALL_AWAIT_OP_ID ||
+	       call->state == AFS_CALL_AWAIT_REQUEST ||
+	       call->state == AFS_CALL_AWAIT_ACK
+	       ) {
+		if (call->state == AFS_CALL_AWAIT_ACK) {
+			size_t offset = 0;
+			ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+						     NULL, 0, &offset, false,
+						     &call->abort_code);
+			if (ret == -EINPROGRESS || ret == -EAGAIN)
+				return;	/* nothing conclusive yet */
+			if (ret == 1 || ret < 0) { /* other errors end the call too */
+				call->state = AFS_CALL_COMPLETE;
+				goto done;
 			}
-			break;
-		case RXRPC_SKB_MARK_FINAL_ACK:
-			_debug("Rcv ACK");
-			call->state = AFS_CALL_COMPLETE;
-			break;
-		case RXRPC_SKB_MARK_BUSY:
-			_debug("Rcv BUSY");
-			call->error = -EBUSY;
-			call->state = AFS_CALL_BUSY;
-			break;
-		case RXRPC_SKB_MARK_REMOTE_ABORT:
-			abort_code = rxrpc_kernel_get_abort_code(skb);
-			call->error = call->type->abort_to_error(abort_code);
-			call->state = AFS_CALL_ABORTED;
-			_debug("Rcv ABORT %u -> %d", abort_code, call->error);
-			break;
-		case RXRPC_SKB_MARK_LOCAL_ABORT:
-			abort_code = rxrpc_kernel_get_abort_code(skb);
-			call->error = call->type->abort_to_error(abort_code);
-			call->state = AFS_CALL_ABORTED;
-			_debug("Loc ABORT %u -> %d", abort_code, call->error);
-			break;
-		case RXRPC_SKB_MARK_NET_ERROR:
-			call->error = -rxrpc_kernel_get_error_number(skb);
-			call->state = AFS_CALL_ERROR;
-			_debug("Rcv NET ERROR %d", call->error);
-			break;
-		case RXRPC_SKB_MARK_LOCAL_ERROR:
-			call->error = -rxrpc_kernel_get_error_number(skb);
-			call->state = AFS_CALL_ERROR;
-			_debug("Rcv LOCAL ERROR %d", call->error);
-			break;
-		default:
-			BUG();
-			break;
+			return;
 		}
 
-		afs_free_skb(skb);
-	}
-
-	/* make sure the queue is empty if the call is done with (we might have
-	 * aborted the call early because of an unmarshalling error) */
-	if (call->state >= AFS_CALL_COMPLETE) {
-		while ((skb = skb_dequeue(&call->rx_queue)))
-			afs_free_skb(skb);
-		if (call->incoming)
-			afs_end_call(call);
+		ret = call->type->deliver(call);
+		switch (ret) {
+		case 0:
+			if (call->state == AFS_CALL_AWAIT_REPLY)
+				call->state = AFS_CALL_COMPLETE;
+			goto done;
+		case -EINPROGRESS:
+		case -EAGAIN:
+			goto out;
+		case -ENOTCONN:
+			abort_code = RX_CALL_DEAD;
+			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+						abort_code, -ret, "KNC");
+			goto do_abort;
+		case -ENOTSUPP:
+			abort_code = RX_INVALID_OPERATION;
+			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+						abort_code, -ret, "KIV");
+			goto do_abort;
+		case -ENODATA:
+		case -EBADMSG:
+		case -EMSGSIZE:
+		default:
+			abort_code = RXGEN_CC_UNMARSHAL;
+			if (call->state != AFS_CALL_AWAIT_REPLY)
+				abort_code = RXGEN_SS_UNMARSHAL;
+			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+						abort_code, EBADMSG, "KUM");
+			goto do_abort;
+		}
 	}
 
+done:
+	if (call->state == AFS_CALL_COMPLETE && call->incoming)
+		afs_end_call(call);
+out:
 	_leave("");
+	return;
+
+do_abort:
+	call->error = ret;
+	call->state = AFS_CALL_COMPLETE;
+	goto done;
 }
 
 /*
@@ -572,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call)
  */
 static int afs_wait_for_call_to_complete(struct afs_call *call)
 {
-	struct sk_buff *skb;
+	const char *abort_why;
 	int ret;
 
 	DECLARE_WAITQUEUE(myself, current);
@@ -584,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		/* deliver any messages that are in the queue */
-		if (!skb_queue_empty(&call->rx_queue)) {
+		if (call->state < AFS_CALL_COMPLETE && call->need_attention) {
+			call->need_attention = false;
 			__set_current_state(TASK_RUNNING);
 			afs_deliver_to_call(call);
 			continue;
 		}
 
+		abort_why = "KWC";
 		ret = call->error;
-		if (call->state >= AFS_CALL_COMPLETE)
+		if (call->state == AFS_CALL_COMPLETE)
 			break;
+		abort_why = "KWI";
 		ret = -EINTR;
 		if (signal_pending(current))
 			break;
@@ -605,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	/* kill the call */
 	if (call->state < AFS_CALL_COMPLETE) {
 		_debug("call incomplete");
-		rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
-		while ((skb = skb_dequeue(&call->rx_queue)))
-			afs_free_skb(skb);
+		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+					RX_CALL_DEAD, -ret, abort_why);
 	}
 
 	_debug("call complete");
@@ -619,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 /*
  * wake up a waiting call
  */
-static void afs_wake_up_call_waiter(struct afs_call *call)
+static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall,
+				    unsigned long call_user_ID)
 {
+	struct afs_call *call = (struct afs_call *)call_user_ID;
+
+	call->need_attention = true;
 	wake_up(&call->waitq);
 }
 
 /*
  * wake up an asynchronous call
  */
-static void afs_wake_up_async_call(struct afs_call *call)
+static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
+				   unsigned long call_user_ID)
 {
-	_enter("");
+	struct afs_call *call = (struct afs_call *)call_user_ID;
+
+	call->need_attention = true;
 	queue_work(afs_async_calls, &call->async_work);
 }
 
@@ -647,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
 /*
  * delete an asynchronous call
  */
-static void afs_delete_async_call(struct afs_call *call)
+static void afs_delete_async_call(struct work_struct *work)
 {
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
 	_enter("");
 
 	afs_free_call(call);
@@ -658,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call)
 
 /*
  * perform processing on an asynchronous call
- * - on a multiple-thread workqueue this work item may try to run on several
- *   CPUs at the same time
  */
-static void afs_process_async_call(struct afs_call *call)
+static void afs_process_async_call(struct work_struct *work)
 {
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
 	_enter("");
 
-	if (!skb_queue_empty(&call->rx_queue))
+	if (call->state < AFS_CALL_COMPLETE && call->need_attention) {
+		call->need_attention = false;
 		afs_deliver_to_call(call);
+	}
 
-	if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+	if (call->state == AFS_CALL_COMPLETE && call->wait_mode) {
 		if (call->wait_mode->async_complete)
 			call->wait_mode->async_complete(call->reply,
 							call->error);
@@ -679,122 +595,93 @@ static void afs_process_async_call(struct afs_call *call)
 
 		/* we can't just delete the call because the work item may be
 		 * queued */
-		call->async_workfn = afs_delete_async_call;
+		call->async_work.func = afs_delete_async_call;
 		queue_work(afs_async_calls, &call->async_work);
 	}
 
 	_leave("");
 }
 
-/*
- * Empty a socket buffer into a flat reply buffer.
- */
-int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID)
 {
-	size_t len = skb->len;
-
-	if (len > call->reply_max - call->reply_size) {
-		_leave(" = -EBADMSG [%zu > %u]",
-		       len, call->reply_max - call->reply_size);
-		return -EBADMSG;
-	}
+	struct afs_call *call = (struct afs_call *)user_call_ID;
 
-	if (len > 0) {
-		if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
-				  len) < 0)
-			BUG();
-		call->reply_size += len;
-	}
-
-	afs_data_consumed(call, skb);
-	if (!last)
-		return -EAGAIN;
-
-	if (call->reply_size != call->reply_max) {
-		_leave(" = -EBADMSG [%u != %u]",
-		       call->reply_size, call->reply_max);
-		return -EBADMSG;
-	}
-	return 0;
+	call->rxcall = rxcall;
 }
 
 /*
- * accept the backlog of incoming calls
+ * Charge the incoming call preallocation.
  */
-static void afs_collect_incoming_call(struct work_struct *work)
+static void afs_charge_preallocation(struct work_struct *work)
 {
-	struct rxrpc_call *rxcall;
-	struct afs_call *call = NULL;
-	struct sk_buff *skb;
-
-	while ((skb = skb_dequeue(&afs_incoming_calls))) {
-		_debug("new call");
-
-		/* don't need the notification */
-		afs_free_skb(skb);
+	struct afs_call *call = afs_spare_incoming_call;
 
+	for (;;) {
 		if (!call) {
 			call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
-			if (!call) {
-				rxrpc_kernel_reject_call(afs_socket);
-				return;
-			}
+			if (!call)
+				break;
 
-			call->async_workfn = afs_process_async_call;
-			INIT_WORK(&call->async_work, afs_async_workfn);
+			INIT_WORK(&call->async_work, afs_process_async_call);
 			call->wait_mode = &afs_async_incoming_call;
 			call->type = &afs_RXCMxxxx;
 			init_waitqueue_head(&call->waitq);
-			skb_queue_head_init(&call->rx_queue);
 			call->state = AFS_CALL_AWAIT_OP_ID;
-
-			_debug("CALL %p{%s} [%d]",
-			       call, call->type->name,
-			       atomic_read(&afs_outstanding_calls));
-			atomic_inc(&afs_outstanding_calls);
 		}
 
-		rxcall = rxrpc_kernel_accept_call(afs_socket,
-						  (unsigned long) call);
-		if (!IS_ERR(rxcall)) {
-			call->rxcall = rxcall;
-			call = NULL;
-		}
+		if (rxrpc_kernel_charge_accept(afs_socket,
+					       afs_wake_up_async_call,
+					       afs_rx_attach,
+					       (unsigned long)call,
+					       GFP_KERNEL) < 0)
+			break;
+		call = NULL;
 	}
+	afs_spare_incoming_call = call;
+}
+
+/*
+ * Discard a preallocated call when a socket is shut down.
+ */
+static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
+				    unsigned long user_call_ID)
+{
+	struct afs_call *call = (struct afs_call *)user_call_ID;
 
-	if (call)
-		afs_free_call(call);
+	atomic_inc(&afs_outstanding_calls);	/* as in afs_close_socket() */
+	call->rxcall = NULL;		/* afs_free_call() asserts this */
+	afs_free_call(call);
+}
+
+/*
+ * Notification of an incoming call.
+ */
+static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
+			    unsigned long user_call_ID)
+{
+	atomic_inc(&afs_outstanding_calls);
+	queue_work(afs_wq, &afs_charge_preallocation_work);
 }
 
 /*
  * Grab the operation ID from an incoming cache manager call.  The socket
  * buffer is discarded on error or if we don't yet have sufficient data.
  */
-static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
-				bool last)
+static int afs_deliver_cm_op_id(struct afs_call *call)
 {
-	size_t len = skb->len;
-	void *oibuf = (void *) &call->operation_ID;
+	int ret;
 
-	_enter("{%u},{%zu},%d", call->offset, len, last);
+	_enter("{%zu}", call->offset);
 
 	ASSERTCMP(call->offset, <, 4);
 
 	/* the operation ID forms the first four bytes of the request data */
-	len = min_t(size_t, len, 4 - call->offset);
-	if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
-		BUG();
-	if (!pskb_pull(skb, len))
-		BUG();
-	call->offset += len;
-
-	if (call->offset < 4) {
-		afs_data_consumed(call, skb);
-		_leave(" = -EAGAIN");
-		return -EAGAIN;
-	}
+	ret = afs_extract_data(call, &call->operation_ID, 4, true);
+	if (ret < 0)
+		return ret;
 
 	call->state = AFS_CALL_AWAIT_REQUEST;
+	call->offset = 0;
 
 	/* ask the cache manager to route the call (it'll change the call type
 	 * if successful) */
@@ -803,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 
 	/* pass responsibility for the remainer of this message off to the
 	 * cache manager op */
-	return call->type->deliver(call, skb, last);
+	return call->type->deliver(call);
 }
 
 /*
@@ -823,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call)
 	msg.msg_flags		= 0;
 
 	call->state = AFS_CALL_AWAIT_ACK;
-	switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+	switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) {
 	case 0:
 		_leave(" [replied]");
 		return;
 
 	case -ENOMEM:
 		_debug("oom");
-		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+					RX_USER_ABORT, ENOMEM, "KOO");
 	default:
 		afs_end_call(call);
 		_leave(" [error]");
@@ -859,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	msg.msg_flags		= 0;
 
 	call->state = AFS_CALL_AWAIT_ACK;
-	n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
+	n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len);
 	if (n >= 0) {
 		/* Success */
 		_leave(" [replied]");
@@ -868,7 +756,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 
 	if (n == -ENOMEM) {
 		_debug("oom");
-		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+					RX_USER_ABORT, ENOMEM, "KOO");
 	}
 	afs_end_call(call);
 	_leave(" [error]");
@@ -877,25 +766,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 /*
  * Extract a piece of data from the received data socket buffers.
  */
-int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
-		     bool last, void *buf, size_t count)
+int afs_extract_data(struct afs_call *call, void *buf, size_t count,
+		     bool want_more)
 {
-	size_t len = skb->len;
+	int ret;
 
-	_enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
+	_enter("{%s,%zu},,%zu,%d",
+	       call->type->name, call->offset, count, want_more);
 
-	ASSERTCMP(call->offset, <, count);
+	ASSERTCMP(call->offset, <=, count);
 
-	len = min_t(size_t, len, count - call->offset);
-	if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
-	    !pskb_pull(skb, len))
-		BUG();
-	call->offset += len;
+	ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+				     buf, count, &call->offset,
+				     want_more, &call->abort_code);
+	if (ret == 0 || ret == -EAGAIN)
+		return ret;
 
-	if (call->offset < count) {
-		afs_data_consumed(call, skb);
-		_leave(" = -EAGAIN");
-		return -EAGAIN;
+	if (ret == 1) {
+		switch (call->state) {
+		case AFS_CALL_AWAIT_REPLY:
+			call->state = AFS_CALL_COMPLETE;
+			break;
+		case AFS_CALL_AWAIT_REQUEST:
+			call->state = AFS_CALL_REPLYING;
+			break;
+		default:
+			break;
+		}
+		return 0;
 	}
-	return 0;
+
+	if (ret == -ECONNABORTED)
+		call->error = call->type->abort_to_error(call->abort_code);
+	else
+		call->error = ret;
+	call->state = AFS_CALL_COMPLETE;
+	return ret;
 }
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f342acf3547d..d4066ab7dd55 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -178,13 +178,18 @@ server_in_two_cells:
 /*
  * look up a server by its IP address
  */
-struct afs_server *afs_find_server(const struct in_addr *_addr)
+struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
 {
 	struct afs_server *server = NULL;
 	struct rb_node *p;
-	struct in_addr addr = *_addr;
+	struct in_addr addr = srx->transport.sin.sin_addr;
 
-	_enter("%pI4", &addr.s_addr);
+	_enter("{%d,%pI4}", srx->transport.family, &addr.s_addr);
+
+	if (srx->transport.family != AF_INET) {
+		WARN(true, "AFS does not yet support non-IPv4 addresses\n");
+		return NULL;
+	}
 
 	read_lock(&afs_servers_lock);
 
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index f94d1abdc3eb..94bcd97d22b8 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code)
 /*
  * deliver reply data to a VL.GetEntryByXXX call
  */
-static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
-					   struct sk_buff *skb, bool last)
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call)
 {
 	struct afs_cache_vlocation *entry;
 	__be32 *bp;
 	u32 tmp;
 	int loop, ret;
 
-	_enter(",,%u", last);
+	_enter("");
 
-	ret = afs_transfer_reply(call, skb, last);
+	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 52976785a32c..45a86396fd2d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work)
  */
 int __init afs_vlocation_update_init(void)
 {
-	afs_vlocation_update_worker =
-		create_singlethread_workqueue("kafs_vlupdated");
+	afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated",
+						      WQ_MEM_RECLAIM, 0);
 	return afs_vlocation_update_worker ? 0 : -ENOMEM;
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index fb8e45b88cd4..4fe81d1c60f9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
+	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
+					   AIO_RING_MAGIC);
+
+	if (!IS_ERR(root))
+		root->d_sb->s_iflags |= SB_I_NOEXEC;
+	return root;
 }
 
 /* aio_setup
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909e7492..d8e6d421c27f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	}
 	return NULL;
 }
+
 /*
  * Find an eligible tree to time-out
  * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	struct dentry *root = sb->s_root;
 	struct dentry *dentry;
 	struct dentry *expired;
+	struct dentry *found;
 	struct autofs_info *ino;
 
 	if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 	dentry = NULL;
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
+		int flags = how;
+
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
-		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
-			expired = NULL;
-		else
-			expired = should_expire(dentry, mnt, timeout, how);
-		if (!expired) {
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
 			spin_unlock(&sbi->fs_lock);
 			continue;
 		}
+		spin_unlock(&sbi->fs_lock);
+
+		expired = should_expire(dentry, mnt, timeout, flags);
+		if (!expired)
+			continue;
+
+		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(expired);
 		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
-		spin_lock(&sbi->fs_lock);
-		if (should_expire(expired, mnt, timeout, how)) {
-			if (expired != dentry)
-				dput(dentry);
-			goto found;
-		}
 
+		/* Recheck with AUTOFS_EXP_LEAVES masked off so that a
+		 * reference is not taken on found if things have changed.
+		 */
+		flags &= ~AUTOFS_EXP_LEAVES;
+		found = should_expire(expired, mnt, timeout, flags);
+		if (!found || found != expired)
+			/* Something has changed, continue */
+			goto next;
+
+		if (expired != dentry)
+			dput(dentry);
+
+		spin_lock(&sbi->fs_lock);
+		goto found;
+next:
+		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+		spin_unlock(&sbi->fs_lock);
 		if (expired != dentry)
 			dput(expired);
-		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
 
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
+	int state;
 
 	/* Block on any pending expire */
 	if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	if (rcu_walk)
 		return -ECHILD;
 
+retry:
 	spin_lock(&sbi->fs_lock);
-	if (ino->flags & AUTOFS_INF_EXPIRING) {
+	state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+	if (state == AUTOFS_INF_WANT_EXPIRE) {
+		spin_unlock(&sbi->fs_lock);
+		/*
+		 * Possibly being selected for expire, wait until
+		 * it's selected or not.
+		 */
+		schedule_timeout_uninterruptible(HZ/10);
+		goto retry;
+	}
+	if (state & AUTOFS_INF_EXPIRING) {
 		spin_unlock(&sbi->fs_lock);
 
 		pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 431fd7ee3488..e44271dfceb6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
-		wq->uid = current_uid();
-		wq->gid = current_gid();
+		wq->uid = current_real_cred()->uid;
+		wq->gid = current_real_cred()->gid;
 		wq->pid = pid;
 		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5495f37c6ed..2472af2798c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task,
 		regset->writeback(task, regset, 1);
 }
 
-#ifndef PR_REG_SIZE
-#define PR_REG_SIZE(S) sizeof(S)
-#endif
-
 #ifndef PRSTATUS_SIZE
-#define PRSTATUS_SIZE(S) sizeof(S)
-#endif
-
-#ifndef PR_REG_PTR
-#define PR_REG_PTR(S) (&((S)->pr_reg))
+#define PRSTATUS_SIZE(S, R) sizeof(S)
 #endif
 
 #ifndef SET_PR_FPVALID
-#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
 #endif
 
 static int fill_thread_core_info(struct elf_thread_core_info *t,
@@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 				 long signr, size_t *total)
 {
 	unsigned int i;
+	unsigned int regset_size = view->regsets[0].n * view->regsets[0].size;
 
 	/*
 	 * NT_PRSTATUS is the one special case, because the regset data
@@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
 	fill_prstatus(&t->prstatus, t->task, signr);
-	(void) view->regsets[0].get(t->task, &view->regsets[0],
-				    0, PR_REG_SIZE(t->prstatus.pr_reg),
-				    PR_REG_PTR(&t->prstatus), NULL);
+	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size,
+				    &t->prstatus.pr_reg, NULL);
 
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
-		  PRSTATUS_SIZE(t->prstatus), &t->prstatus);
+		  PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus);
 	*total += notesize(&t->notes[0]);
 
 	do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 						  regset->core_note_type,
 						  size, data);
 				else {
-					SET_PR_FPVALID(&t->prstatus, 1);
+					SET_PR_FPVALID(&t->prstatus,
+							1, regset_size);
 					fill_note(&t->notes[i], "CORE",
 						  NT_PRFPREG, size, data);
 				}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eff3993c77b3..33fe03551105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -427,6 +427,7 @@ struct btrfs_space_info {
 	struct list_head ro_bgs;
 	struct list_head priority_tickets;
 	struct list_head tickets;
+	u64 tickets_id;
 
 	struct rw_semaphore groups_sem;
 	/* for block groups in our same type */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8c8a4d1e02b9..665da8f66ff1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4271,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 	if (ret < 0)
 		return ret;
 
-	/*
-	 * Use new btrfs_qgroup_reserve_data to reserve precious data space
-	 *
-	 * TODO: Find a good method to avoid reserve data space for NOCOW
-	 * range, but don't impact performance on quota disable case.
-	 */
+	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
 	ret = btrfs_qgroup_reserve_data(inode, start, len);
+	if (ret)
+		btrfs_free_reserved_data_space_noquota(inode, start, len);
 	return ret;
 }
 
@@ -4966,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head)
  */
 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 {
-	struct reserve_ticket *last_ticket = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_space_info *space_info;
 	u64 to_reclaim;
 	int flush_state;
 	int commit_cycles = 0;
+	u64 last_tickets_id;
 
 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -4984,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		spin_unlock(&space_info->lock);
 		return;
 	}
-	last_ticket = list_first_entry(&space_info->tickets,
-				       struct reserve_ticket, list);
+	last_tickets_id = space_info->tickets_id;
 	spin_unlock(&space_info->lock);
 
 	flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -5005,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 							      space_info);
 		ticket = list_first_entry(&space_info->tickets,
 					  struct reserve_ticket, list);
-		if (last_ticket == ticket) {
+		if (last_tickets_id == space_info->tickets_id) {
 			flush_state++;
 		} else {
-			last_ticket = ticket;
+			last_tickets_id = space_info->tickets_id;
 			flush_state = FLUSH_DELAYED_ITEMS_NR;
 			if (commit_cycles)
 				commit_cycles--;
@@ -5384,6 +5380,7 @@ again:
 			list_del_init(&ticket->list);
 			num_bytes -= ticket->bytes;
 			ticket->bytes = 0;
+			space_info->tickets_id++;
 			wake_up(&ticket->wait);
 		} else {
 			ticket->bytes -= num_bytes;
@@ -5426,6 +5423,7 @@ again:
 			num_bytes -= ticket->bytes;
 			space_info->bytes_may_use += ticket->bytes;
 			ticket->bytes = 0;
+			space_info->tickets_id++;
 			wake_up(&ticket->wait);
 		} else {
 			trace_btrfs_space_reservation(fs_info, "space_info",
@@ -8216,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
 
 	/*
 	 * Mixed block groups will exclude before processing the log so we only
@@ -8231,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	if (!block_group)
 		return -EINVAL;
 
-	ret = btrfs_add_reserved_bytes(block_group, ins->offset,
-				       ins->offset, 0);
-	BUG_ON(ret); /* logic error */
+	space_info = block_group->space_info;
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	space_info->bytes_reserved += ins->offset;
+	block_group->reserved += ins->offset;
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
 	btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2a2da5893af..7fd939bfbd99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 	int namelen;
 	int ret = 0;
 
+	if (!S_ISDIR(file_inode(file)->i_mode))
+		return -ENOTDIR;
+
 	ret = mnt_want_write_file(file);
 	if (ret)
 		goto out;
@@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
+	if (!S_ISDIR(file_inode(file)->i_mode))
+		return -ENOTDIR;
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
@@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	bool readonly = false;
 	struct btrfs_qgroup_inherit *inherit = NULL;
 
+	if (!S_ISDIR(file_inode(file)->i_mode))
+		return -ENOTDIR;
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
@@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	int ret;
 	int err = 0;
 
+	if (!S_ISDIR(dir->i_mode))
+		return -ENOTDIR;
+
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e935035ac034..ef9c55bc7907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2867,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
 		blk_finish_plug(&plug);
+		list_del_init(&root_log_ctx.list);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = root_log_ctx.log_ret;
 		goto out;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c64a0b794d49..df4b3e6fa563 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
 	if (is_hash_order(new_pos)) {
 		/* no need to reset last_name for a forward seek when
 		 * dentries are sotred in hash order */
-	} else if (fi->frag |= fpos_frag(new_pos)) {
+	} else if (fi->frag != fpos_frag(new_pos)) {
 		return true;
 	}
 	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6bbec5e784cd..14ae4b8e1a3c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 	char *s, *p;
 	char sep;
 
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+		return dget(sb->s_root);
+
 	full_path = cifs_build_path_to_root(vol, cifs_sb,
 					    cifs_sb_master_tcon(cifs_sb));
 	if (full_path == NULL)
@@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type,
 	cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
 	if (cifs_sb->mountdata == NULL) {
 		root = ERR_PTR(-ENOMEM);
-		goto out_cifs_sb;
+		goto out_free;
 	}
 
-	if (volume_info->prepath) {
-		cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL);
-		if (cifs_sb->prepath == NULL) {
-			root = ERR_PTR(-ENOMEM);
-			goto out_cifs_sb;
-		}
+	rc = cifs_setup_cifs_sb(volume_info, cifs_sb);
+	if (rc) {
+		root = ERR_PTR(rc);
+		goto out_free;
 	}
 
-	cifs_setup_cifs_sb(volume_info, cifs_sb);
-
 	rc = cifs_mount(cifs_sb, volume_info);
 	if (rc) {
 		if (!(flags & MS_SILENT))
 			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
 				 rc);
 		root = ERR_PTR(rc);
-		goto out_mountdata;
+		goto out_free;
 	}
 
 	mnt_data.vol = volume_info;
@@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
-		root = dget(sb->s_root);
-	else
-		root = cifs_get_root(volume_info, sb);
-
+	root = cifs_get_root(volume_info, sb);
 	if (IS_ERR(root))
 		goto out_super;
 
@@ -752,9 +747,9 @@ out:
 	cifs_cleanup_volume_info(volume_info);
 	return root;
 
-out_mountdata:
+out_free:
+	kfree(cifs_sb->prepath);
 	kfree(cifs_sb->mountdata);
-out_cifs_sb:
 	kfree(cifs_sb);
 out_nls:
 	unload_nls(volume_info->local_nls);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1243bd326591..95dab43646f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
 			         unsigned int to_read);
 extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
 				      struct page *page, unsigned int to_read);
-extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 			       struct cifs_sb_info *cifs_sb);
 extern int cifs_match_super(struct super_block *, void *);
 extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae03283bd61..2e4f4bad8b1e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 	return 1;
 }
 
+/*
+ * Superblocks can only be shared when both mounts agree on prefix path usage
+ * and, when a prefix path is in use, on the NUL-terminated path itself.
+ */
+static int
+match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+	struct cifs_sb_info *old = CIFS_SB(sb);
+	struct cifs_sb_info *new = mnt_data->cifs_sb;
+	int old_set = !!(old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH);
+	int new_set = !!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH);
+
+	if (old_set != new_set)
+		return 0;
+	/* Neither mount uses a prefix path, or both must use the same one. */
+	return !old_set || !strcmp(new->prepath, old->prepath);
+}
+
 int
 cifs_match_super(struct super_block *sb, void *data)
 {
@@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data)
 
 	if (!match_server(tcp_srv, volume_info) ||
 	    !match_session(ses, volume_info) ||
-	    !match_tcon(tcon, volume_info->UNC)) {
+	    !match_tcon(tcon, volume_info->UNC) ||
+	    !match_prepath(sb, mnt_data)) {
 		rc = 0;
 		goto out;
 	}
@@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
 	}
 }
 
-void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 			struct cifs_sb_info *cifs_sb)
 {
 	INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
@@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
 		cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
+
+	if (pvolume_info->prepath) {
+		cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL);
+		if (cifs_sb->prepath == NULL)
+			return -ENOMEM;
+	}
+
+	return 0;
 }
 
 static void
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c30cf49b69d2..2c6312db8516 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
 		if (bin_attr->cb_max_size &&
 			*ppos + count > bin_attr->cb_max_size) {
 			len = -EFBIG;
+			goto out;
 		}
 
 		tbuf = vmalloc(*ppos + count);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 0f9961eede1e..ed115acb5dee 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -11,6 +11,7 @@
 #include <linux/random.h>
 #include <linux/string.h>
 #include <linux/fscrypto.h>
+#include <linux/mount.h>
 
 static int inode_has_encryption_context(struct inode *inode)
 {
@@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode,
 	return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
 }
 
-int fscrypt_process_policy(struct inode *inode,
+int fscrypt_process_policy(struct file *filp,
 				const struct fscrypt_policy *policy)
 {
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
 	if (policy->version != 0)
 		return -EINVAL;
 
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
 	if (!inode_has_encryption_context(inode)) {
-		if (!inode->i_sb->s_cop->empty_dir)
-			return -EOPNOTSUPP;
-		if (!inode->i_sb->s_cop->empty_dir(inode))
-			return -ENOTEMPTY;
-		return create_encryption_context_from_policy(inode, policy);
+		if (!S_ISDIR(inode->i_mode))
+			ret = -EINVAL;
+		else if (!inode->i_sb->s_cop->empty_dir)
+			ret = -EOPNOTSUPP;
+		else if (!inode->i_sb->s_cop->empty_dir(inode))
+			ret = -ENOTEMPTY;
+		else
+			ret = create_encryption_context_from_policy(inode,
+								    policy);
+	} else if (!is_encryption_context_consistent_with_policy(inode,
+								 policy)) {
+		printk(KERN_WARNING
+		       "%s: Policy inconsistent with encryption context\n",
+		       __func__);
+		ret = -EINVAL;
 	}
 
-	if (is_encryption_context_consistent_with_policy(inode, policy))
-		return 0;
-
-	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
-	       __func__);
-	return -EINVAL;
+	mnt_drop_write_file(filp);
+	return ret;
 }
 EXPORT_SYMBOL(fscrypt_process_policy);
 
diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..cc025f82ef07 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-			struct buffer_head *bh, void **entryp,
-			struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to equal the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;		/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 592059f88e04..354e2ab62031 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
 
 #define F_DENTRY(filp) ((filp)->f_path.dentry)
 
-#define REAL_FOPS_DEREF(dentry)					\
-	((const struct file_operations *)(dentry)->d_fsdata)
-
 static int open_proxy_open(struct inode *inode, struct file *filp)
 {
 	const struct dentry *dentry = F_DENTRY(filp);
@@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
-	real_fops = REAL_FOPS_DEREF(dentry);
+	real_fops = debugfs_real_fops(filp);
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not clean up after itself at exit? */
@@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto)				\
 {									\
 	const struct dentry *dentry = F_DENTRY(filp);			\
 	const struct file_operations *real_fops =			\
-		REAL_FOPS_DEREF(dentry);				\
+		debugfs_real_fops(filp);				\
 	int srcu_idx;							\
 	ret_type r;							\
 									\
@@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp,
 				struct poll_table_struct *wait)
 {
 	const struct dentry *dentry = F_DENTRY(filp);
-	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	const struct file_operations *real_fops = debugfs_real_fops(filp);
 	int srcu_idx;
 	unsigned int r = 0;
 
@@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp,
 static int full_proxy_release(struct inode *inode, struct file *filp)
 {
 	const struct dentry *dentry = F_DENTRY(filp);
-	const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+	const struct file_operations *real_fops = debugfs_real_fops(filp);
 	const struct file_operations *proxy_fops = filp->f_op;
 	int r = 0;
 
@@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
 	replace_fops(filp, d_inode(dentry)->i_fop);
 	kfree((void *)proxy_fops);
 	fops_put(real_fops);
-	return 0;
+	return r;
 }
 
 static void __full_proxy_fops_init(struct file_operations *proxy_fops,
@@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
-	real_fops = REAL_FOPS_DEREF(dentry);
+	real_fops = debugfs_real_fops(filp);
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not cleanup after itself at exit? */
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index bba52634b995..b3e8443a1f47 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations;
 extern const struct file_operations debugfs_open_proxy_file_operations;
 extern const struct file_operations debugfs_full_proxy_file_operations;
 
-struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
-					struct dentry *parent, void *data,
-					const struct file_operations *fops);
-
 #endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 79a5941c2474..442d1a7e671b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb)
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
-	kuid_t root_uid;
-	kgid_t root_gid;
-
-	root_uid = make_kuid(current_user_ns(), 0);
-	root_gid = make_kgid(current_user_ns(), 0);
-	if (!uid_valid(root_uid) || !gid_valid(root_gid))
-		return -EINVAL;
+	kuid_t ptmx_uid = current_fsuid();
+	kgid_t ptmx_gid = current_fsgid();
 
 	inode_lock(d_inode(root));
 
@@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb)
 
 	mode = S_IFCHR|opts->ptmxmode;
 	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
-	inode->i_uid = root_uid;
-	inode->i_gid = root_gid;
+	inode->i_uid = ptmx_uid;
+	inode->i_gid = ptmx_gid;
 
 	d_add(dentry, inode);
 
@@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
-	sync_filesystem(sb);
 	err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
 	/*
@@ -395,6 +389,7 @@ static int
 devpts_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode *inode;
+	int error;
 
 	s->s_iflags &= ~SB_I_NODEV;
 	s->s_blocksize = 1024;
@@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_op = &devpts_sops;
 	s->s_time_gran = 1;
 
+	error = -ENOMEM;
 	s->s_fs_info = new_pts_fs_info(s);
 	if (!s->s_fs_info)
 		goto fail;
 
+	error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
+	if (error)
+		goto fail;
+
+	error = -ENOMEM;
 	inode = new_inode(s);
 	if (!inode)
 		goto fail;
@@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	set_nlink(inode, 2);
 
 	s->s_root = d_make_root(inode);
-	if (s->s_root)
-		return 0;
+	if (!s->s_root) {
+		pr_err("get root dentry failed\n");
+		goto fail;
+	}
 
-	pr_err("get root dentry failed\n");
+	error = mknod_ptmx(s);
+	if (error)
+		goto fail_dput;
 
+	return 0;
+fail_dput:
+	dput(s->s_root);
+	s->s_root = NULL;
 fail:
-	return -ENOMEM;
+	return error;
 }
 
 /*
@@ -436,43 +445,15 @@ fail:
 static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	int error;
-	struct pts_mount_opts opts;
-	struct super_block *s;
-
-	error = parse_mount_options(data, PARSE_MOUNT, &opts);
-	if (error)
-		return ERR_PTR(error);
-
-	s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-	if (IS_ERR(s))
-		return ERR_CAST(s);
-
-	if (!s->s_root) {
-		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-		if (error)
-			goto out_undo_sget;
-		s->s_flags |= MS_ACTIVE;
-	}
-
-	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
-
-	error = mknod_ptmx(s);
-	if (error)
-		goto out_undo_sget;
-
-	return dget(s->s_root);
-
-out_undo_sget:
-	deactivate_locked_super(s);
-	return ERR_PTR(error);
+	return mount_nodev(fs_type, flags, data, devpts_fill_super);
 }
 
 static void devpts_kill_sb(struct super_block *sb)
 {
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 
-	ida_destroy(&fsi->allocated_ptys);
+	if (fsi)
+		ida_destroy(&fsi->allocated_ptys);
 	kfree(fsi);
 	kill_litter_super(sb);
 }
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 1d73fc6dba13..cbb50cadcffc 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
 
 	inode->i_private = var;
 
-	efivar_entry_add(var, &efivarfs_list);
+	err = efivar_entry_add(var, &efivarfs_list);
+	if (err)
+		goto out;
+
 	d_instantiate(dentry, inode);
 	dget(dentry);
 out:
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 688ccc16b702..d7a7c53803c1 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 		goto fail_inode;
 	}
 
+	efivar_entry_size(entry, &size);
+	err = efivar_entry_add(entry, &efivarfs_list);
+	if (err)
+		goto fail_inode;
+
 	/* copied by the above to local storage in the dentry. */
 	kfree(name);
 
-	efivar_entry_size(entry, &size);
-	efivar_entry_add(entry, &efivarfs_list);
-
 	inode_lock(inode);
 	inode->i_private = entry;
 	i_size_write(inode, size + sizeof(entry->var.Attributes));
@@ -182,7 +184,10 @@ fail:
 
 static int efivarfs_destroy(struct efivar_entry *entry, void *data)
 {
-	efivar_entry_remove(entry);
+	int err = efivar_entry_remove(entry);
+
+	if (err)
+		return err;
 	kfree(entry);
 	return 0;
 }
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
 config EXT2_FS
 	tristate "Second extended fs support"
+	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..423cc01c9d41 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	inode_lock_shared(inode);
+	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
  *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_fault(vma, vmf, ext2_get_block);
+	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_write_iter(iocb, from);
+#endif
+	return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
+	.read_iter	= ext2_file_read_iter,
+	.write_iter	= ext2_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index efe5fb21c533..04e73a99902b 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		gdp = ext2_get_group_desc(sb, group, &bh2);
+		if (!gdp) {
+			if (++group == sbi->s_groups_count)
+				group = 0;
+			continue;
+		}
 		brelse(bitmap_bh);
 		bitmap_bh = read_inode_bitmap(sb, group);
 		if (!bitmap_bh) {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index d5c7d09919f3..1e72d425fd3b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
 			   sector_t iblock, unsigned long maxblocks,
-			   struct buffer_head *bh_result,
+			   u32 *bno, bool *new, bool *boundary,
 			   int create)
 {
 	int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result); /* What's this do? */
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
-			clear_buffer_new(bh_result);
 			goto got_it;
 		}
 	}
@@ -733,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode,
 	}
 
 	if (IS_DAX(inode)) {
+		int i;
+
+		/*
+		 * We must unmap blocks before zeroing so that writeback cannot
+		 * overwrite zeros with stale data from block device page cache.
+		 */
+		for (i = 0; i < count; i++) {
+			unmap_underlying_metadata(inode->i_sb->s_bdev,
+					le32_to_cpu(chain[depth-1].key) + i);
+		}
 		/*
 		 * block must be initialised before we put it in the tree
 		 * so that it's not found by another thread before it's
@@ -745,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else
-		set_buffer_new(bh_result);
+	} else {
+		*new = true;
+	}
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	*bno = le32_to_cpu(chain[depth-1].key);
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		*boundary = true;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -765,19 +775,82 @@ cleanup:
 	return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
 {
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int ret = ext2_get_blocks(inode, iblock, max_blocks,
-			      bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+			create);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh_result, inode->i_sb, bno);
+	bh_result->b_size = (ret << inode->i_blkbits);
+	if (new)
+		set_buffer_new(bh_result);
+	if (boundary)
+		set_buffer_boundary(bh_result);
+	return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, first_block, max_blocks,
+			&bno, &new, &boundary, flags & IOMAP_WRITE);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
 	}
-	return ret;
 
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
 }
 
+/* Report a short write into a mapped extent to ext2_write_failed(). */
+static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+		ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	bool short_write = (flags & IOMAP_WRITE) && written < length;
+
+	if (short_write && iomap->type == IOMAP_MAPPED)
+		ext2_write_failed(inode->i_mapping, offset + length);
+	return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+	.iomap_begin		= ext2_iomap_begin,
+	.iomap_end		= ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
@@ -863,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
-		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 10686fd67fb4..1bb7df5e4536 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -776,7 +776,7 @@ resizefs_out:
 				   (struct fscrypt_policy __user *)arg,
 				   sizeof(policy)))
 			return -EFAULT;
-		return fscrypt_process_policy(inode, &policy);
+		return fscrypt_process_policy(filp, &policy);
 #else
 		return -EOPNOTSUPP;
 #endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 47abb96098e4..28f4f4cbb8d8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
 	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
-	int ret;
 
 	if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
 							sizeof(policy)))
 		return -EFAULT;
 
-	ret = mnt_want_write_file(filp);
-	if (ret)
-		return ret;
-
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-	ret = fscrypt_process_policy(inode, &policy);
 
-	mnt_drop_write_file(filp);
-	return ret;
+	return fscrypt_process_policy(filp, &policy);
 }
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 1b2f6c2c3aaf..76f09ce7e5b2 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -1,5 +1,6 @@
 config FUSE_FS
 	tristate "FUSE (Filesystem in Userspace) support"
+	select FS_POSIX_ACL
 	help
 	  With FUSE it is possible to implement a fully functional filesystem
 	  in a userspace program.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index e95eeb445e58..60da84a86dab 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 
-fuse-objs := dev.o dir.o file.o inode.o control.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
new file mode 100644
index 000000000000..ec85765502f1
--- /dev/null
+++ b/fs/fuse/acl.c
@@ -0,0 +1,99 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Fetch the @type ACL of @inode via fuse_getxattr(); NULL if none is set.
+ */
+struct posix_acl *fuse_get_acl(struct inode *inode, int type)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct posix_acl *acl = NULL;
+	const char *name;
+	void *value;
+	int size;
+
+	if (!fc->posix_acl || fc->no_getxattr)
+		return NULL;
+
+	if (type != ACL_TYPE_ACCESS && type != ACL_TYPE_DEFAULT)
+		return ERR_PTR(-EOPNOTSUPP);
+	name = (type == ACL_TYPE_ACCESS) ? XATTR_NAME_POSIX_ACL_ACCESS :
+					   XATTR_NAME_POSIX_ACL_DEFAULT;
+
+	value = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!value)
+		return ERR_PTR(-ENOMEM);
+
+	size = fuse_getxattr(inode, name, value, PAGE_SIZE);
+	if (size > 0)
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	else if (size == -ERANGE)	/* value did not fit in one page */
+		acl = ERR_PTR(-E2BIG);
+	else if (size && size != -ENODATA &&
+		 !(size == -EOPNOTSUPP && fc->no_getxattr))
+		acl = ERR_PTR(size);
+
+	kfree(value);
+	return acl;
+}
+
+/*
+ * Store @acl as the @type ACL of @inode, or remove that ACL when @acl is NULL.
+ */
+int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	const char *name;
+	void *value;
+	size_t size;
+	int ret;
+
+	if (!fc->posix_acl || fc->no_setxattr)
+		return -EOPNOTSUPP;
+
+	if (type != ACL_TYPE_ACCESS && type != ACL_TYPE_DEFAULT)
+		return -EINVAL;
+	name = (type == ACL_TYPE_ACCESS) ? XATTR_NAME_POSIX_ACL_ACCESS :
+					   XATTR_NAME_POSIX_ACL_DEFAULT;
+
+	if (!acl) {
+		ret = fuse_removexattr(inode, name);
+		goto out;
+	}
+
+	size = posix_acl_xattr_size(acl->a_count);
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	value = kmalloc(size, GFP_KERNEL);
+	if (!value)
+		return -ENOMEM;
+
+	ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+	if (ret < 0) {
+		kfree(value);
+		return ret;
+	}
+
+	/*
+	 * Userspace updates the inode's permissions if needed; fuse_setxattr
+	 * invalidates the inode attributes and also updates i_ctime.
+	 */
+	ret = fuse_setxattr(inode, name, value, size, 0);
+	kfree(value);
+out:
+	forget_all_cached_acls(inode);
+	fuse_invalidate_attr(inode);
+
+	return ret;
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a94d2ed81ab4..c41bde26c338 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		cs->len = err;
 		cs->offset = off;
 		cs->pg = page;
-		cs->offset = off;
 		iov_iter_advance(cs->iter, err);
 	}
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c47b7780ce37..f7c84ab835ca 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,6 +13,8 @@
 #include <linux/sched.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
 
 static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
@@ -37,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
 	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
-#if BITS_PER_LONG >= 64
+union fuse_dentry {
+	u64 time;
+	struct rcu_head rcu;
+};
+
 static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
 {
-	entry->d_time = time;
+	((union fuse_dentry *) entry->d_fsdata)->time = time;
 }
 
 static inline u64 fuse_dentry_time(struct dentry *entry)
 {
-	return entry->d_time;
-}
-#else
-/*
- * On 32 bit archs store the high 32 bits of time in d_fsdata
- */
-static void fuse_dentry_settime(struct dentry *entry, u64 time)
-{
-	entry->d_time = time;
-	entry->d_fsdata = (void *) (unsigned long) (time >> 32);
-}
-
-static u64 fuse_dentry_time(struct dentry *entry)
-{
-	return (u64) entry->d_time +
-		((u64) (unsigned long) entry->d_fsdata << 32);
+	return ((union fuse_dentry *) entry->d_fsdata)->time;
 }
-#endif
 
 /*
  * FUSE caches dentries and attributes with separate timeout.  The
  * time in jiffies until the dentry/attributes are valid is stored in
- * dentry->d_time and fuse_inode->i_time respectively.
+ * dentry->d_fsdata and fuse_inode->i_time respectively.
  */
 
 /*
  * Calculate the time in jiffies until a dentry/attributes are valid
  */
-static u64 time_to_jiffies(unsigned long sec, unsigned long nsec)
+static u64 time_to_jiffies(u64 sec, u32 nsec)
 {
 	if (sec || nsec) {
-		struct timespec ts = {sec, nsec};
-		return get_jiffies_64() + timespec_to_jiffies(&ts);
+		struct timespec64 ts = {
+			sec,
+			min_t(u32, nsec, NSEC_PER_SEC - 1) /* clamp to < 1s */
+		};
+
+		return get_jiffies_64() + timespec64_to_jiffies(&ts);
 	} else
 		return 0;
 }
@@ -243,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 		if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
 			goto invalid;
 
+		forget_all_cached_acls(inode);
 		fuse_change_attributes(inode, &outarg.attr,
 				       entry_attr_timeout(&outarg),
 				       attr_version);
@@ -272,8 +267,23 @@ static int invalid_nodeid(u64 nodeid)
 	return !nodeid || nodeid == FUSE_ROOT_ID;
 }
 
+static int fuse_dentry_init(struct dentry *dentry)
+{
+	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+
+	return dentry->d_fsdata ? 0 : -ENOMEM;
+}
+static void fuse_dentry_release(struct dentry *dentry)
+{
+	union fuse_dentry *fd = dentry->d_fsdata;
+
+	kfree_rcu(fd, rcu);
+}
+
 const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
+	.d_init		= fuse_dentry_init,
+	.d_release	= fuse_dentry_release,
 };
 
 int fuse_valid_type(int m)
@@ -634,7 +644,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 	return create_new_entry(fc, &args, dir, entry, S_IFLNK);
 }
 
-static inline void fuse_update_ctime(struct inode *inode)
+void fuse_update_ctime(struct inode *inode)
 {
 	if (!IS_NOCMTIME(inode)) {
 		inode->i_ctime = current_fs_time(inode->i_sb);
@@ -917,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 
 	if (time_before64(fi->i_time, get_jiffies_64())) {
 		r = true;
+		forget_all_cached_acls(inode);
 		err = fuse_do_getattr(inode, stat, file);
 	} else {
 		r = false;
@@ -1017,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc)
 {
 	const struct cred *cred;
 
-	if (fc->flags & FUSE_ALLOW_OTHER)
+	if (fc->allow_other)
 		return 1;
 
 	cred = current_cred();
@@ -1064,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
+	forget_all_cached_acls(inode);
 	return fuse_do_getattr(inode, NULL, NULL);
 }
 
@@ -1092,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	/*
 	 * If attributes are needed, refresh them before proceeding
 	 */
-	if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
+	if (fc->default_permissions ||
 	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
@@ -1105,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask)
 		}
 	}
 
-	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+	if (fc->default_permissions) {
 		err = generic_permission(inode, mask);
 
 		/* If permission is denied, try to refresh file
@@ -1233,6 +1245,7 @@ retry:
 		fi->nlookup++;
 		spin_unlock(&fc->lock);
 
+		forget_all_cached_acls(inode);
 		fuse_change_attributes(inode, &o->attr,
 				       entry_attr_timeout(o),
 				       attr_version);
@@ -1605,7 +1618,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	int err;
 	bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
 
-	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+	if (!fc->default_permissions)
 		attr->ia_valid |= ATTR_FORCE;
 
 	err = inode_change_ok(inode, attr);
@@ -1702,172 +1715,75 @@ error:
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(entry);
-
-	if (!fuse_allow_current_process(get_fuse_conn(inode)))
-		return -EACCES;
-
-	if (attr->ia_valid & ATTR_FILE)
-		return fuse_do_setattr(inode, attr, attr->ia_file);
-	else
-		return fuse_do_setattr(inode, attr, NULL);
-}
-
-static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
-			struct kstat *stat)
-{
-	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL;
+	int ret;
 
-	if (!fuse_allow_current_process(fc))
+	if (!fuse_allow_current_process(get_fuse_conn(inode)))
 		return -EACCES;
 
-	return fuse_update_attributes(inode, stat, NULL, NULL);
-}
-
-static int fuse_setxattr(struct dentry *unused, struct inode *inode,
-			 const char *name, const void *value,
-			 size_t size, int flags)
-{
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	FUSE_ARGS(args);
-	struct fuse_setxattr_in inarg;
-	int err;
-
-	if (fc->no_setxattr)
-		return -EOPNOTSUPP;
+	if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) {
+		attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID |
+				    ATTR_MODE);
 
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.size = size;
-	inarg.flags = flags;
-	args.in.h.opcode = FUSE_SETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 3;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
-	args.in.args[2].size = size;
-	args.in.args[2].value = value;
-	err = fuse_simple_request(fc, &args);
-	if (err == -ENOSYS) {
-		fc->no_setxattr = 1;
-		err = -EOPNOTSUPP;
-	}
-	if (!err) {
-		fuse_invalidate_attr(inode);
-		fuse_update_ctime(inode);
+		/*
+		 * The only sane way to reliably kill suid/sgid is to do it in
+		 * the userspace filesystem
+		 *
+		 * This should be done on write(), truncate() and chown().
+		 */
+		if (!fc->handle_killpriv) {
+			int kill;
+
+			/*
+			 * ia_mode calculation may have used stale i_mode.
+			 * Refresh and recalculate.
+			 */
+			ret = fuse_do_getattr(inode, NULL, file);
+			if (ret)
+				return ret;
+
+			attr->ia_mode = inode->i_mode;
+			kill = should_remove_suid(entry);
+			if (kill & ATTR_KILL_SUID) {
+				attr->ia_valid |= ATTR_MODE;
+				attr->ia_mode &= ~S_ISUID;
+			}
+			if (kill & ATTR_KILL_SGID) {
+				attr->ia_valid |= ATTR_MODE;
+				attr->ia_mode &= ~S_ISGID;
+			}
+		}
 	}
-	return err;
-}
-
-static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode,
-			     const char *name, void *value, size_t size)
-{
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	FUSE_ARGS(args);
-	struct fuse_getxattr_in inarg;
-	struct fuse_getxattr_out outarg;
-	ssize_t ret;
+	if (!attr->ia_valid)
+		return 0;
 
-	if (fc->no_getxattr)
-		return -EOPNOTSUPP;
+	ret = fuse_do_setattr(inode, attr, file);
+	if (!ret) {
+		/*
+		 * If filesystem supports acls it may have updated acl xattrs in
+		 * the filesystem, so forget cached acls for the inode.
+		 */
+		if (fc->posix_acl)
+			forget_all_cached_acls(inode);
 
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.size = size;
-	args.in.h.opcode = FUSE_GETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
-	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
-	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = value;
-	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
-	}
-	ret = fuse_simple_request(fc, &args);
-	if (!ret && !size)
-		ret = outarg.size;
-	if (ret == -ENOSYS) {
-		fc->no_getxattr = 1;
-		ret = -EOPNOTSUPP;
+		/* Directory mode changed, may need to revalidate access */
+		if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE))
+			fuse_invalidate_entry_cache(entry);
 	}
 	return ret;
 }
 
-static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+			struct kstat *stat)
 {
 	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	FUSE_ARGS(args);
-	struct fuse_getxattr_in inarg;
-	struct fuse_getxattr_out outarg;
-	ssize_t ret;
 
 	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
-	if (fc->no_listxattr)
-		return -EOPNOTSUPP;
-
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.size = size;
-	args.in.h.opcode = FUSE_LISTXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
-	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = list;
-	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
-	}
-	ret = fuse_simple_request(fc, &args);
-	if (!ret && !size)
-		ret = outarg.size;
-	if (ret == -ENOSYS) {
-		fc->no_listxattr = 1;
-		ret = -EOPNOTSUPP;
-	}
-	return ret;
-}
-
-static int fuse_removexattr(struct dentry *entry, const char *name)
-{
-	struct inode *inode = d_inode(entry);
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	FUSE_ARGS(args);
-	int err;
-
-	if (fc->no_removexattr)
-		return -EOPNOTSUPP;
-
-	args.in.h.opcode = FUSE_REMOVEXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = strlen(name) + 1;
-	args.in.args[0].value = name;
-	err = fuse_simple_request(fc, &args);
-	if (err == -ENOSYS) {
-		fc->no_removexattr = 1;
-		err = -EOPNOTSUPP;
-	}
-	if (!err) {
-		fuse_invalidate_attr(inode);
-		fuse_update_ctime(inode);
-	}
-	return err;
+	return fuse_update_attributes(inode, stat, NULL, NULL);
 }
 
 static const struct inode_operations fuse_dir_inode_operations = {
@@ -1884,10 +1800,12 @@ static const struct inode_operations fuse_dir_inode_operations = {
 	.mknod		= fuse_mknod,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
-	.setxattr	= fuse_setxattr,
-	.getxattr	= fuse_getxattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= fuse_listxattr,
-	.removexattr	= fuse_removexattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= fuse_get_acl,
+	.set_acl	= fuse_set_acl,
 };
 
 static const struct file_operations fuse_dir_operations = {
@@ -1905,10 +1823,12 @@ static const struct inode_operations fuse_common_inode_operations = {
 	.setattr	= fuse_setattr,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
-	.setxattr	= fuse_setxattr,
-	.getxattr	= fuse_getxattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= fuse_listxattr,
-	.removexattr	= fuse_removexattr,
+	.removexattr	= generic_removexattr,
+	.get_acl	= fuse_get_acl,
+	.set_acl	= fuse_set_acl,
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
@@ -1916,10 +1836,10 @@ static const struct inode_operations fuse_symlink_inode_operations = {
 	.get_link	= fuse_get_link,
 	.readlink	= generic_readlink,
 	.getattr	= fuse_getattr,
-	.setxattr	= fuse_setxattr,
-	.getxattr	= fuse_getxattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= fuse_listxattr,
-	.removexattr	= fuse_removexattr,
+	.removexattr	= generic_removexattr,
 };
 
 void fuse_init_common(struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f394aff59c36..b7beb67bf005 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
 	req->out.args[0].size = count;
 }
 
-static void fuse_release_user_pages(struct fuse_req *req, int write)
+static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
 {
 	unsigned i;
 
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
-		if (write)
+		if (should_dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
 	}
@@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 		       loff_t *ppos, int flags)
 {
 	int write = flags & FUSE_DIO_WRITE;
+	bool should_dirty = !write && iter_is_iovec(iter);
 	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct inode *inode = file->f_mapping->host;
@@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 			nres = fuse_send_read(req, io, pos, nbytes, owner);
 
 		if (!io->async)
-			fuse_release_user_pages(req, !write);
+			fuse_release_user_pages(req, should_dirty);
 		if (req->out.h.error) {
 			err = req->out.h.error;
 			break;
@@ -2325,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
 	return retval;
 }
 
-static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
-			unsigned int nr_segs, size_t bytes, bool to_user)
-{
-	struct iov_iter ii;
-	int page_idx = 0;
-
-	if (!bytes)
-		return 0;
-
-	iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
-
-	while (iov_iter_count(&ii)) {
-		struct page *page = pages[page_idx++];
-		size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
-		void *kaddr;
-
-		kaddr = kmap(page);
-
-		while (todo) {
-			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
-			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
-			size_t copy = min(todo, iov_len);
-			size_t left;
-
-			if (!to_user)
-				left = copy_from_user(kaddr, uaddr, copy);
-			else
-				left = copy_to_user(uaddr, kaddr, copy);
-
-			if (unlikely(left))
-				return -EFAULT;
-
-			iov_iter_advance(&ii, copy);
-			todo -= copy;
-			kaddr += copy;
-		}
-
-		kunmap(page);
-	}
-
-	return 0;
-}
-
 /*
  * CUSE servers compiled on 32bit broke on 64bit kernels because the
  * ABI was defined to be 'struct iovec' which is different on 32bit
@@ -2519,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 	struct iovec *iov_page = NULL;
 	struct iovec *in_iov = NULL, *out_iov = NULL;
 	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
-	size_t in_size, out_size, transferred;
-	int err;
+	size_t in_size, out_size, transferred, c;
+	int err, i;
+	struct iov_iter ii;
 
 #if BITS_PER_LONG == 32
 	inarg.flags |= FUSE_IOCTL_32BIT;
@@ -2602,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		req->in.args[1].size = in_size;
 		req->in.argpages = 1;
 
-		err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
-					   false);
-		if (err)
-			goto out;
+		err = -EFAULT;
+		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
+			c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii);
+			if (c != PAGE_SIZE && iov_iter_count(&ii))
+				goto out;
+		}
 	}
 
 	req->out.numargs = 2;
@@ -2671,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 	if (transferred > inarg.out_size)
 		goto out;
 
-	err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+	err = -EFAULT;
+	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
+		c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii);
+		if (c != PAGE_SIZE && iov_iter_count(&ii))
+			goto out;
+	}
+	err = 0;
  out:
 	if (req)
 		fuse_put_request(fc, req);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d98d8cc84def..24ada5dc4dae 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -23,6 +23,7 @@
 #include <linux/poll.h>
 #include <linux/workqueue.h>
 #include <linux/kref.h>
+#include <linux/xattr.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -36,15 +37,6 @@
 /** Number of dentries for each connection in the control filesystem */
 #define FUSE_CTL_NUM_DENTRIES 5
 
-/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
-    module will check permissions based on the file mode.  Otherwise no
-    permission checking is done in the kernel */
-#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
-
-/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
-    doing the mount will be allowed to access the filesystem */
-#define FUSE_ALLOW_OTHER         (1 << 1)
-
 /** Number of page pointers embedded in fuse_req */
 #define FUSE_REQ_INLINE_PAGES 1
 
@@ -469,9 +461,6 @@ struct fuse_conn {
 	/** The group id for this mount */
 	kgid_t group_id;
 
-	/** The fuse mount flags for this mount */
-	unsigned flags;
-
 	/** Maximum read size */
 	unsigned max_read;
 
@@ -547,6 +536,9 @@ struct fuse_conn {
 	/** allow parallel lookups and readdir (default is serialized) */
 	unsigned parallel_dirops:1;
 
+	/** fs handles killing suid/sgid/cap on write/chown/trunc */
+	unsigned handle_killpriv:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -624,6 +616,15 @@ struct fuse_conn {
 	/** Is lseek not implemented by fs? */
 	unsigned no_lseek:1;
 
+	/** Does the filesystem support posix acls? */
+	unsigned posix_acl:1;
+
+	/** Check permissions based on the file mode or not? */
+	unsigned default_permissions:1;
+
+	/** Allow users other than the mounter to access the filesystem? */
+	unsigned allow_other:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -902,6 +903,8 @@ int fuse_allow_current_process(struct fuse_conn *fc);
 
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 
+void fuse_update_ctime(struct inode *inode);
+
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed);
 
@@ -966,4 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc);
 void fuse_unlock_inode(struct inode *inode);
 void fuse_lock_inode(struct inode *inode);
 
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+		  size_t size, int flags);
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+		      size_t size);
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
+int fuse_removexattr(struct inode *inode, const char *name);
+extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler *fuse_acl_xattr_handlers[];
+
+struct posix_acl;
+struct posix_acl *fuse_get_acl(struct inode *inode, int type);
+int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4e05b51120f4..17141099f2e7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/posix_acl.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -66,7 +67,8 @@ struct fuse_mount_data {
 	unsigned rootmode_present:1;
 	unsigned user_id_present:1;
 	unsigned group_id_present:1;
-	unsigned flags;
+	unsigned default_permissions:1;
+	unsigned allow_other:1;
 	unsigned max_read;
 	unsigned blksize;
 };
@@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	 * check in may_delete().
 	 */
 	fi->orig_i_mode = inode->i_mode;
-	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+	if (!fc->default_permissions)
 		inode->i_mode &= ~S_ISVTX;
 
 	fi->orig_ino = attr->ino;
@@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 		return -ENOENT;
 
 	fuse_invalidate_attr(inode);
+	forget_all_cached_acls(inode);
 	if (offset >= 0) {
 		pg_start = offset >> PAGE_SHIFT;
 		if (len <= 0)
@@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 			break;
 
 		case OPT_DEFAULT_PERMISSIONS:
-			d->flags |= FUSE_DEFAULT_PERMISSIONS;
+			d->default_permissions = 1;
 			break;
 
 		case OPT_ALLOW_OTHER:
-			d->flags |= FUSE_ALLOW_OTHER;
+			d->allow_other = 1;
 			break;
 
 		case OPT_MAX_READ:
@@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 
 	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
 	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
-	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+	if (fc->default_permissions)
 		seq_puts(m, ",default_permissions");
-	if (fc->flags & FUSE_ALLOW_OTHER)
+	if (fc->allow_other)
 		seq_puts(m, ",allow_other");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
@@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->writeback_cache = 1;
 			if (arg->flags & FUSE_PARALLEL_DIROPS)
 				fc->parallel_dirops = 1;
+			if (arg->flags & FUSE_HANDLE_KILLPRIV)
+				fc->handle_killpriv = 1;
 			if (arg->time_gran && arg->time_gran <= 1000000000)
 				fc->sb->s_time_gran = arg->time_gran;
+			if ((arg->flags & FUSE_POSIX_ACL)) {
+				fc->default_permissions = 1;
+				fc->posix_acl = 1;
+				fc->sb->s_xattr = fuse_acl_xattr_handlers;
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
 		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
 		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
-		FUSE_PARALLEL_DIROPS;
+		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -1071,6 +1081,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
+	sb->s_xattr = fuse_xattr_handlers;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_time_gran = 1;
 	sb->s_export_op = &fuse_export_operations;
@@ -1109,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		fc->dont_mask = 1;
 	sb->s_flags |= MS_POSIXACL;
 
-	fc->flags = d.flags;
+	fc->default_permissions = d.default_permissions;
+	fc->allow_other = d.allow_other;
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
 	fc->max_read = max_t(unsigned, 4096, d.max_read);
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
new file mode 100644
index 000000000000..3caac46b08b0
--- /dev/null
+++ b/fs/fuse/xattr.c
@@ -0,0 +1,211 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2016  Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+		  size_t size, int flags)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_setxattr_in inarg;
+	int err;
+
+	if (fc->no_setxattr)	/* set below once a request got -ENOSYS */
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	inarg.flags = flags;
+	args.in.h.opcode = FUSE_SETXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 3;	/* inarg, NUL-terminated name, value */
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = strlen(name) + 1;
+	args.in.args[1].value = name;
+	args.in.args[2].size = size;
+	args.in.args[2].value = value;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {	/* remember it; later calls fail early */
+		fc->no_setxattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (!err) {		/* only after a successful set */
+		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	}
+	return err;
+}
+
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+		      size_t size)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_getxattr)	/* set below once a request got -ENOSYS */
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	args.in.h.opcode = FUSE_GETXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 2;	/* inarg, NUL-terminated name */
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.in.args[1].size = strlen(name) + 1;
+	args.in.args[1].value = name;
+	/* size == 0 asks only for the length; otherwise fetch the value */
+	args.out.numargs = 1;
+	if (size) {
+		args.out.argvar = 1;
+		args.out.args[0].size = size;
+		args.out.args[0].value = value;
+	} else {
+		args.out.args[0].size = sizeof(outarg);
+		args.out.args[0].value = &outarg;
+	}
+	ret = fuse_simple_request(fc, &args);
+	if (!ret && !size)	/* length query: clamp the reported size */
+		ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
+	if (ret == -ENOSYS) {	/* remember it; later calls fail early */
+		fc->no_getxattr = 1;
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+static int fuse_verify_xattr_list(char *list, size_t size)
+{
+	size_t origsize = size;
+
+	while (size) {		/* one NUL-terminated name per pass */
+		size_t thislen = strnlen(list, size);
+
+		if (!thislen || thislen == size) /* empty or unterminated */
+			return -EIO;
+
+		size -= thislen + 1;
+		list += thislen + 1;
+	}
+
+	return origsize;	/* every name checked out */
+}
+
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+	struct inode *inode = d_inode(entry);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (!fuse_allow_current_process(fc))
+		return -EACCES;
+
+	if (fc->no_listxattr)	/* set below once a request got -ENOSYS */
+		return -EOPNOTSUPP;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	args.in.h.opcode = FUSE_LISTXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	/* size == 0 asks only for the length; otherwise fetch the list */
+	args.out.numargs = 1;
+	if (size) {
+		args.out.argvar = 1;
+		args.out.args[0].size = size;
+		args.out.args[0].value = list;
+	} else {
+		args.out.args[0].size = sizeof(outarg);
+		args.out.args[0].value = &outarg;
+	}
+	ret = fuse_simple_request(fc, &args);
+	if (!ret && !size)	/* length query: clamp the reported size */
+		ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
+	if (ret > 0 && size)	/* got a list: reject malformed names */
+		ret = fuse_verify_xattr_list(list, ret);
+	if (ret == -ENOSYS) {	/* remember it; later calls fail early */
+		fc->no_listxattr = 1;
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+int fuse_removexattr(struct inode *inode, const char *name)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+	int err;
+
+	if (fc->no_removexattr)	/* set below once a request got -ENOSYS */
+		return -EOPNOTSUPP;
+
+	args.in.h.opcode = FUSE_REMOVEXATTR;
+	args.in.h.nodeid = get_node_id(inode);
+	args.in.numargs = 1;	/* just the NUL-terminated name */
+	args.in.args[0].size = strlen(name) + 1;
+	args.in.args[0].value = name;
+	err = fuse_simple_request(fc, &args);
+	if (err == -ENOSYS) {	/* remember it; later calls fail early */
+		fc->no_removexattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	if (!err) {		/* only after a successful removal */
+		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	}
+	return err;
+}
+
+static int fuse_xattr_get(const struct xattr_handler *handler,
+			 struct dentry *dentry, struct inode *inode,
+			 const char *name, void *value, size_t size)
+{
+	return fuse_getxattr(inode, name, value, size);
+}
+
+static int fuse_xattr_set(const struct xattr_handler *handler,
+			  struct dentry *dentry, struct inode *inode,
+			  const char *name, const void *value, size_t size,
+			  int flags)
+{
+	if (!value)	/* a NULL value is a request to remove the xattr */
+		return fuse_removexattr(inode, name);
+
+	return fuse_setxattr(inode, name, value, size, flags);
+}
+
+static const struct xattr_handler fuse_xattr_handler = {
+	.prefix = "",
+	.get    = fuse_xattr_get,
+	.set    = fuse_xattr_set,
+};
+
+const struct xattr_handler *fuse_xattr_handlers[] = {
+	&fuse_xattr_handler,
+	NULL
+};
+
+const struct xattr_handler *fuse_acl_xattr_handlers[] = {
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+	&fuse_xattr_handler,
+	NULL
+};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 82df36886938..5a6f52ea2722 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
 		ClearPageChecked(page);
 		if (!page_has_buffers(page)) {
 			create_empty_buffers(page, inode->i_sb->s_blocksize,
-					     (1 << BH_Dirty)|(1 << BH_Uptodate));
+					     BIT(BH_Dirty)|BIT(BH_Uptodate));
 		}
 		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
 	}
@@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 	if (!page_has_buffers(page))
 		return 0;
 
+	/*
+	 * From xfs_vm_releasepage: mm accommodates an old ext3 case where
+	 * clean pages might not have had the dirty bit cleared.  Thus, it can
+	 * send actual dirty pages to ->releasepage() via shrink_active_list().
+	 *
+	 * As a workaround, we skip pages that contain dirty buffers below.
+	 * Once ->releasepage() is no longer called on dirty pages, we can go
+	 * back to warning about dirty buffers here, as we used to.
+	 */
+
 	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
 	head = bh = page_buffers(page);
@@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		bd = bh->b_private;
 		if (bd && bd->bd_tr)
 			goto cannot_release;
-		if (buffer_pinned(bh) || buffer_dirty(bh))
-			goto not_possible;
+		if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh)))
+			goto cannot_release;
 		bh = bh->b_this_page;
 	} while(bh != head);
 	spin_unlock(&sdp->sd_ail_lock);
@@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 
 	return try_to_free_buffers(page);
 
-not_possible: /* Should never happen */
-	WARN_ON(buffer_dirty(bh));
-	WARN_ON(buffer_pinned(bh));
 cannot_release:
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6e2bec1cd289..645721f3ff00 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 	}
 
 	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits,
-				     (1 << BH_Uptodate));
+		create_empty_buffers(page, BIT(inode->i_blkbits),
+				     BIT(BH_Uptodate));
 
 	bh = page_buffers(page);
 
@@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 	BUG_ON(!dblock);
 	BUG_ON(!new);
 
-	bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
+	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
 	ret = gfs2_block_map(inode, lblock, &bh, create);
 	*extlen = bh.b_size >> inode->i_blkbits;
 	*dblock = bh.b_blocknr;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fcb59b23f1e3..db8fbeb62483 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
 	if (hc)
 		return hc;
 
-	hsize = 1 << ip->i_depth;
+	hsize = BIT(ip->i_depth);
 	hsize *= sizeof(__be64);
 	if (hsize != i_size_read(&ip->i_inode)) {
 		gfs2_consist_inode(ip);
@@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 
 	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf;
-		unsigned hsize = 1 << ip->i_depth;
-		unsigned index;
+		unsigned int hsize = BIT(ip->i_depth);
+		unsigned int index;
 		u64 ln;
 		if (hsize * sizeof(u64) != i_size_read(inode)) {
 			gfs2_consist_inode(ip);
@@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode)
 		return -ENOSPC;
 	bn = bh->b_blocknr;
 
-	gfs2_assert(sdp, dip->i_entries < (1 << 16));
+	gfs2_assert(sdp, dip->i_entries < BIT(16));
 	leaf->lf_entries = cpu_to_be16(dip->i_entries);
 
 	/*  Copy dirents  */
@@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	bn = nbh->b_blocknr;
 
 	/*  Compute the start and len of leaf pointers in the hash table.  */
-	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
+	len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth));
 	half_len = len >> 1;
 	if (!half_len) {
 		pr_warn("i_depth %u lf_depth %u index %u\n",
@@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	int x;
 	int error = 0;
 
-	hsize = 1 << dip->i_depth;
+	hsize = BIT(dip->i_depth);
 	hsize_bytes = hsize * sizeof(__be64);
 
 	hc = gfs2_dir_get_hash_table(dip);
@@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx,
 	int error = 0;
 	unsigned depth = 0;
 
-	hsize = 1 << dip->i_depth;
+	hsize = BIT(dip->i_depth);
 	hash = gfs2_dir_offset2hash(ctx->pos);
 	index = hash >> (32 - dip->i_depth);
 
@@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx,
 		if (error)
 			break;
 
-		len = 1 << (dip->i_depth - depth);
+		len = BIT(dip->i_depth - depth);
 		index = (index & ~(len - 1)) + len;
 	}
 
@@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 	u64 leaf_no;
 	int error = 0, last;
 
-	hsize = 1 << dip->i_depth;
+	hsize = BIT(dip->i_depth);
 
 	lp = gfs2_dir_get_hash_table(dip);
 	if (IS_ERR(lp))
@@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 			if (error)
 				goto out;
 			leaf = (struct gfs2_leaf *)bh->b_data;
-			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
+			len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth));
 
 			next_index = (index & ~(len - 1)) + len;
 			last = ((next_index >= hsize) ? 1 : 0);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 320e65e61938..360188f162bd 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	sb_start_pagefault(inode->i_sb);
 
-	/* Update file times before taking page lock */
-	file_update_time(vma->vm_file);
-
 	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
 		goto out;
@@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out_uninit;
 
+	/* Update file times before taking page lock */
+	file_update_time(vma->vm_file);
+
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3a90b2b5b9bb..14cbf60167a7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
-#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_SIZE       BIT(GFS2_GL_HASH_SHIFT)
 
 static struct rhashtable_params ht_parms = {
 	.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
@@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void)
 		return -ENOMEM;
 	}
 
-	register_shrinker(&glock_shrinker);
+	ret = register_shrinker(&glock_shrinker);
+	if (ret) {
+		destroy_workqueue(gfs2_delete_workqueue);
+		destroy_workqueue(glock_workqueue);
+		rhashtable_destroy(&gl_hash_table);
+		return ret;
+	}
 
 	return 0;
 }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e4da0ecd3285..fb3a810b506f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 		}
 
 		gfs2_set_iop(inode);
+
+		inode->i_atime.tv_sec = 0;
+		inode->i_atime.tv_nsec = 0;
+
 		unlock_new_inode(inode);
 	}
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 7710dfd3af35..aace8ce34a18 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode,
 	u64 size = i_size_read(inode);
 	if (size < minsize || size > maxsize)
 		goto err;
-	if (size & ((1 << inode->i_blkbits) - 1))
+	if (size & (BIT(inode->i_blkbits) - 1))
 		goto err;
 	return 0;
 err:
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 74fd0139e6c2..67d1fc4668f7 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_qadata_cachep)
 		goto fail;
 
-	register_shrinker(&gfs2_qd_shrinker);
+	error = register_shrinker(&gfs2_qd_shrinker);
+	if (error)
+		goto fail;
 
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 950b8be68e41..373639a59782 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio)
 static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
 			    int num)
 {
-	struct buffer_head *bh = bhs[0];
-	struct bio *bio;
-	int i;
-
-	if (!num)
-		return;
-
-	bio = bio_alloc(GFP_NOIO, num);
-	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio->bi_bdev = bh->b_bdev;
-	for (i = 0; i < num; i++) {
-		bh = bhs[i];
-		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+	while (num > 0) {
+		struct buffer_head *bh = *bhs;
+		struct bio *bio;
+
+		bio = bio_alloc(GFP_NOIO, num);
+		bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+		bio->bi_bdev = bh->b_bdev;
+		while (num > 0) {
+			bh = *bhs;
+			if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
+				BUG_ON(bio->bi_iter.bi_size == 0);
+				break;
+			}
+			bhs++;
+			num--;
+		}
+		bio->bi_end_io = gfs2_meta_read_endio;
+		bio_set_op_attrs(bio, op, op_flags);
+		submit_bio(bio);
 	}
-	bio->bi_end_io = gfs2_meta_read_endio;
-	bio_set_op_attrs(bio, op, op_flags);
-	submit_bio(bio);
 }
 
 /**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ef1e1822977f..ff72ac6439c8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
 	gt->gt_new_files_jdata = 0;
-	gt->gt_max_readahead = 1 << 18;
+	gt->gt_max_readahead = BIT(18);
 	gt->gt_complain_secs = 10;
 }
 
@@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 
 	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
 			       GFS2_BASIC_BLOCK_SHIFT;
-	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
 	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
 			  sizeof(struct gfs2_dinode)) / sizeof(u64);
 	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
@@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 
 	/* Compute maximum reservation required to add a entry to a directory */
 
-	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH),
 			     sdp->sd_jbsize);
 
 	ind_blocks = 0;
@@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
 	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
                                GFS2_BASIC_BLOCK_SHIFT;
-	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
 
 	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 77930ca25303..8af2dfa09236 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -75,7 +75,7 @@
 #include "util.h"
 
 #define GFS2_QD_HASH_SHIFT      12
-#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_SIZE       BIT(GFS2_QD_HASH_SHIFT)
 #define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
 
 /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
@@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 	block = qd->qd_slot / sdp->sd_qc_per_block;
 	offset = qd->qd_slot % sdp->sd_qc_per_block;
 
-	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+	bh_map.b_size = BIT(ip->i_inode.i_blkbits);
 	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
 	if (error)
 		goto fail;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 3a7e60bb39f8..e3ee387a6dfe 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 	u64 size = i_size_read(jd->jd_inode);
 
-	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30)))
 		return -EIO;
 
 	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..859178692ce4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0f56deb24ce6..c415668c86d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
-static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
 {
 	struct file_dedupe_range __user *argp = arg;
 	struct file_dedupe_range *same = NULL;
@@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
 	}
 
 	size = offsetof(struct file_dedupe_range __user, info[count]);
+	if (size > PAGE_SIZE) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	same = memdup_user(argp, size);
 	if (IS_ERR(same)) {
diff --git a/fs/iomap.c b/fs/iomap.c
index 706270f21b35..013d1d36fbbf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 		struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
@@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {	/* drop our reference and fail */
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	return page;	/* uptodate; the caller does the put_page() */
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page, *rpage;
+		unsigned long offset;	/* Byte offset of pos within its page */
+		unsigned long bytes;	/* Bytes handled in this iteration */
+
+		offset = (pos & (PAGE_SIZE - 1));
+		bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+
+		rpage = __iomap_read_page(inode, pos); /* uptodate or ERR_PTR */
+		if (IS_ERR(rpage))
+			return PTR_ERR(rpage);
+
+		status = iomap_write_begin(inode, pos, bytes,
+				AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
+				&page, iomap);
+		put_page(rpage);	/* held only across write_begin */
+		if (unlikely(status))
+			return status;
+
+		WARN_ON_ONCE(!PageUptodate(page));
+
+		status = iomap_write_end(inode, pos, bytes, bytes, page);
+		if (unlikely(status <= 0)) {	/* error, or no progress */
+			if (WARN_ON_ONCE(status == 0))
+				return -EIO;
+			return status;
+		}
+
+		cond_resched();
+
+		pos += status;
+		written += status;
+		length -= status;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (length);
+
+	return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+		struct iomap_ops *ops)
+{
+	loff_t ret;
+
+	while (len) {	/* each pass advances by what iomap_apply() did */
+		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+				iomap_dirty_actor);
+		if (ret <= 0)	/* error, or nothing processed: stop */
+			return ret;
+		pos += ret;
+		len -= ret;
+	}
+
+	return 0;
+}
+
 static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 		unsigned bytes, struct iomap *iomap)
 {
@@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 
 	if (iomap->flags & IOMAP_F_MERGED)
 		flags |= FIEMAP_EXTENT_MERGED;
+	if (iomap->flags & IOMAP_F_SHARED)
+		flags |= FIEMAP_EXTENT_SHARED;
 
 	return fiemap_fill_next_extent(fi, iomap->offset,
 			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 2e58978d6f45..4d973524c887 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2893,8 +2893,7 @@ restart:
 	 * on anon_list2.  Let's check.
 	 */
 	if (!list_empty(&TxAnchor.anon_list2)) {
-		list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
-		INIT_LIST_HEAD(&TxAnchor.anon_list2);
+		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
 		goto restart;
 	}
 	TXN_UNLOCK();
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 90b3bc21e9b0..bd9b641ada2c 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 * cached in meta-data cache, and not written out
 	 * by txCommit();
 	 */
-	filemap_fdatawait(ipbmap->i_mapping);
-	filemap_write_and_wait(ipbmap->i_mapping);
+	rc = filemap_fdatawait(ipbmap->i_mapping);
+	if (rc)
+		goto error_out;
+
+	rc = filemap_write_and_wait(ipbmap->i_mapping);
+	if (rc)
+		goto error_out;
+
 	diWriteSpecial(ipbmap, 0);
 
 	newPage = nPages;	/* first new page number */
diff --git a/fs/locks.c b/fs/locks.c
index ee1b15f6fc13..90ec67108b22 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/hashtable.h>
 #include <linux/percpu.h>
-#include <linux/lglock.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/filelock.h>
@@ -158,12 +157,18 @@ int lease_break_time = 45;
 
 /*
  * The global file_lock_list is only used for displaying /proc/locks, so we
- * keep a list on each CPU, with each list protected by its own spinlock via
- * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant flc_lock is held.
+ * keep a list on each CPU, with each list protected by its own spinlock.
+ * Global serialization is done using file_rwsem.
+ *
+ * Note that alterations to the list also require that the relevant flc_lock is
+ * held.
  */
-DEFINE_STATIC_LGLOCK(file_lock_lglock);
-static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+struct file_lock_list_struct {
+	spinlock_t		lock;	/* taken around hlist add/del */
+	struct hlist_head	hlist;	/* file_locks linked via fl_link */
+};
+static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
+DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
 
 /*
  * The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -587,15 +592,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 /* Must be called with the flc_lock held! */
 static void locks_insert_global_locks(struct file_lock *fl)
 {
-	lg_local_lock(&file_lock_lglock);
+	struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
+
+	percpu_rwsem_assert_held(&file_rwsem);
+
+	spin_lock(&fll->lock);
 	fl->fl_link_cpu = smp_processor_id();
-	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
-	lg_local_unlock(&file_lock_lglock);
+	hlist_add_head(&fl->fl_link, &fll->hlist);
+	spin_unlock(&fll->lock);
 }
 
 /* Must be called with the flc_lock held! */
 static void locks_delete_global_locks(struct file_lock *fl)
 {
+	struct file_lock_list_struct *fll;
+
+	percpu_rwsem_assert_held(&file_rwsem);
+
 	/*
 	 * Avoid taking lock if already unhashed. This is safe since this check
 	 * is done while holding the flc_lock, and new insertions into the list
@@ -603,9 +616,11 @@ static void locks_delete_global_locks(struct file_lock *fl)
 	 */
 	if (hlist_unhashed(&fl->fl_link))
 		return;
-	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+
+	fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+	spin_lock(&fll->lock);
 	hlist_del_init(&fl->fl_link);
-	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	spin_unlock(&fll->lock);
 }
 
 static unsigned long
@@ -915,6 +930,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 			return -ENOMEM;
 	}
 
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
@@ -955,6 +971,7 @@ find_conflict:
 
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	locks_dispose_list(&dispose);
@@ -991,6 +1008,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 		new_fl2 = locks_alloc_lock();
 	}
 
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	/*
 	 * New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1162,6 +1180,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	}
  out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1436,6 +1455,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 		return error;
 	}
 
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 
 	time_out_leases(inode, &dispose);
@@ -1487,9 +1507,13 @@ restart:
 	locks_insert_block(fl, new_fl);
 	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
+
 	locks_dispose_list(&dispose);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
+
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	trace_break_lease_unblock(inode, new_fl);
 	locks_delete_block(new_fl);
@@ -1506,6 +1530,7 @@ restart:
 	}
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
 	locks_dispose_list(&dispose);
 	locks_free_lock(new_fl);
 	return error;
@@ -1660,6 +1685,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 		return -EINVAL;
 	}
 
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
 	error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1730,6 +1756,7 @@ out_setup:
 		lease->fl_lmops->lm_setup(lease, priv);
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
 	locks_dispose_list(&dispose);
 	if (is_deleg)
 		inode_unlock(inode);
@@ -1752,6 +1779,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 		return error;
 	}
 
+	percpu_down_read_preempt_disable(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
 		if (fl->fl_file == filp &&
@@ -1764,6 +1792,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 	if (victim)
 		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read_preempt_enable(&file_rwsem);
 	locks_dispose_list(&dispose);
 	return error;
 }
@@ -2574,9 +2603,20 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	struct inode *inode = NULL;
 	unsigned int fl_pid;
 
-	if (fl->fl_nspid)
-		fl_pid = pid_vnr(fl->fl_nspid);
-	else
+	if (fl->fl_nspid) {
+		struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
+
+		/* Don't let fl_pid change based on who is reading the file */
+		fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns);
+
+		/*
+		 * If fl_pid is 0 (no pid in that namespace), print nothing:
+		 * when called from locks_show this omits the waiter, and
+		 * when called from __show_fd_info it skips the lock entirely.
+		 */
+		if (fl_pid == 0)
+			return;
+	} else
 		fl_pid = fl->fl_pid;
 
 	if (fl->fl_file != NULL)
@@ -2648,9 +2688,13 @@ static int locks_show(struct seq_file *f, void *v)
 {
 	struct locks_iterator *iter = f->private;
 	struct file_lock *fl, *bfl;
+	struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
 
 	fl = hlist_entry(v, struct file_lock, fl_link);
 
+	if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns))
+		return 0;
+
 	lock_get_status(f, fl, iter->li_pos, "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
@@ -2703,9 +2747,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos)
 	struct locks_iterator *iter = f->private;
 
 	iter->li_pos = *pos + 1;
-	lg_global_lock(&file_lock_lglock);
+	percpu_down_write(&file_rwsem);
 	spin_lock(&blocked_lock_lock);
-	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+	return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
@@ -2713,14 +2757,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 	struct locks_iterator *iter = f->private;
 
 	++iter->li_pos;
-	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+	return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 	__releases(&blocked_lock_lock)
 {
 	spin_unlock(&blocked_lock_lock);
-	lg_global_unlock(&file_lock_lglock);
+	percpu_up_write(&file_rwsem);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2761,10 +2805,13 @@ static int __init filelock_init(void)
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
-	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
 
-	for_each_possible_cpu(i)
-		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+	for_each_possible_cpu(i) {
+		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
+
+		spin_lock_init(&fll->lock);
+		INIT_HLIST_HEAD(&fll->hlist);
+	}
 
 	return 0;
 }
diff --git a/fs/mount.h b/fs/mount.h
index 14db05d424f7..d2e25d7b64b3 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,9 +10,12 @@ struct mnt_namespace {
 	struct mount *	root;
 	struct list_head	list;
 	struct user_namespace	*user_ns;
+	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
+	unsigned int		mounts; /* # of mounts in the namespace */
+	unsigned int		pending_mounts;
 };
 
 struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda3bfef..db1b5a38864e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,9 @@
 #include "pnode.h"
 #include "internal.h"
 
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
 static unsigned int m_hash_mask __read_mostly;
 static unsigned int m_hash_shift __read_mostly;
 static unsigned int mp_hash_mask __read_mostly;
@@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
 	list_splice(&head, n->list.prev);
 
+	n->mounts += n->pending_mounts;
+	n->pending_mounts = 0;
+
 	attach_shadowed(mnt, parent, shadows);
 	touch_mnt_namespace(n);
 }
@@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		propagate_umount(&tmp_list);
 
 	while (!list_empty(&tmp_list)) {
+		struct mnt_namespace *ns;
 		bool disconnect;
 		p = list_first_entry(&tmp_list, struct mount, mnt_list);
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
-		__touch_mnt_namespace(p->mnt_ns);
+		ns = p->mnt_ns;
+		if (ns) {
+			ns->mounts--;
+			__touch_mnt_namespace(ns);
+		}
 		p->mnt_ns = NULL;
 		if (how & UMOUNT_SYNC)
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
 	return 0;
 }
 
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+	unsigned int max = READ_ONCE(sysctl_mount_max);
+	unsigned int mounts = 0, old, pending, sum;
+	struct mount *p;
+
+	for (p = mnt; p; p = next_mnt(p, mnt))
+		mounts++;
+
+	old = ns->mounts;
+	pending = ns->pending_mounts;
+	sum = old + pending;
+	if ((old > sum) ||
+	    (pending > sum) ||
+	    (max < sum) ||
+	    (mounts > (max - sum)))
+		return -ENOSPC;
+
+	ns->pending_mounts = pending + mounts;
+	return 0;
+}
+
 /*
  *  @source_mnt : mount tree to be attached
  *  @nd         : place the mount tree @source_mnt is attached
@@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 			struct path *parent_path)
 {
 	HLIST_HEAD(tree_list);
+	struct mnt_namespace *ns = dest_mnt->mnt_ns;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Is there space to add these mounts to the mount namespace? */
+	if (!parent_path) {
+		err = count_mounts(ns, source_mnt);
+		if (err)
+			goto out;
+	}
+
 	if (IS_MNT_SHARED(dest_mnt)) {
 		err = invent_group_ids(source_mnt, true);
 		if (err)
@@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
  out_cleanup_ids:
 	while (!hlist_empty(&tree_list)) {
 		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		child->mnt_parent->mnt_ns->pending_mounts = 0;
 		umount_tree(child, UMOUNT_SYNC);
 	}
 	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
  out:
+	ns->pending_mounts = 0;
 	return err;
 }
 
@@ -2719,9 +2762,20 @@ dput_out:
 	return retval;
 }
 
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
 	ns_free_inum(&ns->ns);
+	dec_mnt_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
@@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
+	struct ucounts *ucounts;
 	int ret;
 
+	ucounts = inc_mnt_namespaces(user_ns);
+	if (!ucounts)
+		return ERR_PTR(-ENOSPC);
+
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-	if (!new_ns)
+	if (!new_ns) {
+		dec_mnt_namespaces(ucounts);
 		return ERR_PTR(-ENOMEM);
+	}
 	ret = ns_alloc_inum(&new_ns->ns);
 	if (ret) {
 		kfree(new_ns);
+		dec_mnt_namespaces(ucounts);
 		return ERR_PTR(ret);
 	}
 	new_ns->ns.ops = &mntns_operations;
@@ -2756,6 +2818,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
 	new_ns->user_ns = get_user_ns(user_ns);
+	new_ns->ucounts = ucounts;
+	new_ns->mounts = 0;
+	new_ns->pending_mounts = 0;
 	return new_ns;
 }
 
@@ -2805,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	q = new;
 	while (p) {
 		q->mnt_ns = new_ns;
+		new_ns->mounts++;
 		if (new_fs) {
 			if (&p->mnt == new_fs->root.mnt) {
 				new_fs->root.mnt = mntget(&q->mnt);
@@ -2843,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
 		new_ns->root = mnt;
+		new_ns->mounts++;
 		list_add(&mnt->mnt_list, &new_ns->list);
 	} else {
 		mntput(m);
@@ -3348,10 +3415,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+	return to_mnt_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations mntns_operations = {
 	.name		= "mnt",
 	.type		= CLONE_NEWNS,
 	.get		= mntns_get,
 	.put		= mntns_put,
 	.install	= mntns_install,
+	.owner		= mntns_owner,
 };
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7d620970f2e1..ca699ddc11c1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 	if (result <= 0)
 		goto out;
 
-	written = generic_write_sync(iocb, result);
+	result = generic_write_sync(iocb, result);
+	if (result < 0)
+		goto out;
+	written = result;
 	iocb->ki_pos += written;
 
 	/* Return error values */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f5aecaabcb7c..a9dec32ba9ba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7570,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_create_session(clp, status);
 
+	switch (status) {
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EACCES:
+	case -EAGAIN:
+		goto out;
+	};
+
+	clp->cl_seqid++;
 	if (!status) {
 		/* Verify the session's negotiated channel_attrs values */
 		status = nfs4_verify_channel_attrs(&args, &res);
 		/* Increment the clientid slot sequence id */
-		if (clp->cl_seqid == res.seqid)
-			clp->cl_seqid++;
 		if (status)
 			goto out;
 		nfs4_update_session(session, &res);
@@ -8190,10 +8198,13 @@ static void nfs4_layoutreturn_release(void *calldata)
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
-	pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
-			be32_to_cpu(lrp->args.stateid.seqid));
-	if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
+	if (lrp->res.lrs_present) {
+		pnfs_mark_matching_lsegs_invalid(lo, &freeme,
+				&lrp->args.range,
+				be32_to_cpu(lrp->args.stateid.seqid));
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	} else
+		pnfs_mark_layout_stateid_invalid(lo, &freeme);
 	pnfs_clear_layoutreturn_waitbit(lo);
 	spin_unlock(&lo->plh_inode->i_lock);
 	nfs4_sequence_free_slot(&lrp->res.seq_res);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6daf034645c8..2c93a85eda51 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
 	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
 	atomic_dec(&lo->plh_refcount);
 	if (list_empty(&lo->plh_segs)) {
-		set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+		if (atomic_read(&lo->plh_outstanding) == 0)
+			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 	}
 	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
@@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	pnfs_destroy_layouts_byclid(clp, false);
 }
 
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+	lo->plh_return_iomode = 0;
+	lo->plh_return_seq = 0;
+	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
 			bool update_barrier)
 {
 	u32 oldseq, newseq, new_barrier = 0;
-	bool invalid = !pnfs_layout_is_valid(lo);
 
 	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
 	newseq = be32_to_cpu(new->seqid);
-	if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
+
+	if (!pnfs_layout_is_valid(lo)) {
+		nfs4_stateid_copy(&lo->plh_stateid, new);
+		lo->plh_barrier = newseq;
+		pnfs_clear_layoutreturn_info(lo);
+		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+		return;
+	}
+	if (pnfs_seqid_is_newer(newseq, oldseq)) {
 		nfs4_stateid_copy(&lo->plh_stateid, new);
 		/*
 		 * Because of wraparound, we want to keep the barrier
@@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
 		new_barrier = be32_to_cpu(new->seqid);
 	else if (new_barrier == 0)
 		return;
-	if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+	if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
 		lo->plh_barrier = new_barrier;
 }
 
@@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
 	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 
-static void
-pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
-{
-	lo->plh_return_iomode = 0;
-	lo->plh_return_seq = 0;
-	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-}
-
 static bool
 pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
 		nfs4_stateid *stateid,
 		enum pnfs_iomode *iomode)
 {
+	/* Serialise LAYOUTGET/LAYOUTRETURN */
+	if (atomic_read(&lo->plh_outstanding) != 0)
+		return false;
 	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 		return false;
 	pnfs_get_layout_hdr(lo);
@@ -1798,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		 */
 		pnfs_mark_layout_stateid_invalid(lo, &free_me);
 
-		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
-		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+		pnfs_set_layout_stateid(lo, &res->stateid, true);
 	}
 
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg, &free_me);
-	if (!pnfs_layout_is_valid(lo)) {
-		pnfs_clear_layoutreturn_info(lo);
-		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-	}
 
 
 	if (res->return_on_close)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d2f97ecca6a5..e0e5f7c3c99f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	wait_event(group->fanotify_data.access_waitq, event->response ||
-				atomic_read(&group->fanotify_data.bypass_perm));
-
-	if (!event->response) {	/* bypass_perm set */
-		/*
-		 * Event was canceled because group is being destroyed. Remove
-		 * it from group's event list because we are responsible for
-		 * freeing the permission event.
-		 */
-		fsnotify_remove_event(group, &event->fae.fse);
-		return 0;
-	}
+	wait_event(group->fanotify_data.access_waitq, event->response);
 
 	/* userspace responded, convert to something usable */
 	switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	struct fanotify_perm_event_info *event, *next;
+	struct fsnotify_event *fsn_event;
 
 	/*
-	 * There may be still new events arriving in the notification queue
-	 * but since userspace cannot use fanotify fd anymore, no event can
-	 * enter or leave access_list by now.
+	 * Stop new events from arriving in the notification queue. Since
+	 * userspace cannot use fanotify fd anymore, no event can enter or
+	 * leave access_list by now either.
 	 */
-	spin_lock(&group->fanotify_data.access_lock);
-
-	atomic_inc(&group->fanotify_data.bypass_perm);
+	fsnotify_group_stop_queueing(group);
 
+	/*
+	 * Process all permission events on access_list and notification queue
+	 * and simulate reply from userspace.
+	 */
+	spin_lock(&group->fanotify_data.access_lock);
 	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
 				 fae.fse.list) {
 		pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	spin_unlock(&group->fanotify_data.access_lock);
 
 	/*
-	 * Since bypass_perm is set, newly queued events will not wait for
-	 * access response. Wake up the already sleeping ones now.
-	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
-	 * processes sleeping in fanotify_handle_event() waiting for access
-	 * response and thus also for all permission events to be freed.
+	 * Destroy all non-permission events. For permission events just
+	 * dequeue them and set the response. They will be freed once the
+	 * response is consumed and fanotify_get_response() returns.
 	 */
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		fsn_event = fsnotify_remove_first_event(group);
+		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+			fsnotify_destroy_event(group, fsn_event);
+		else
+			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+	}
+	mutex_unlock(&group->notification_mutex);
+
+	/* Response for all permission events is set, wake up waiters */
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
@@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	spin_lock_init(&group->fanotify_data.access_lock);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
-	atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
 	switch (flags & FAN_ALL_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 3e2dd85be5dd..b47f7cfdcaa4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 }
 
 /*
+ * Stop queueing new events for this group. Once this function returns
+ * fsnotify_add_event() will not add any new events to the group's queue.
+ */
+void fsnotify_group_stop_queueing(struct fsnotify_group *group)
+{
+	mutex_lock(&group->notification_mutex);
+	group->shutdown = true;
+	mutex_unlock(&group->notification_mutex);
+}
+
+/*
  * Trying to get rid of a group. Remove all marks, flush all events and release
  * the group reference.
  * Note that another thread calling fsnotify_clear_marks_by_group() may still
@@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_destroy_group(struct fsnotify_group *group)
 {
+	/*
+	 * Stop queueing new events. The code below is careful enough to not
+	 * require this but fanotify needs to stop queueing events even before
+	 * fsnotify_destroy_group() is called and this makes the other callers
+	 * of fsnotify_destroy_group() see the same behavior.
+	 */
+	fsnotify_group_stop_queueing(group);
+
 	/* clear all inode marks for this group, attach them to destroy_list */
 	fsnotify_detach_group_marks(group);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index a95d8e037aeb..e455e83ceeeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
  * Add an event to the group notification queue.  The group can later pull this
  * event off the queue to deal with.  The function returns 0 if the event was
  * added to the queue, 1 if the event was merged with some other queued event,
- * 2 if the queue of events has overflown.
+ * 2 if the event was not queued - either the queue of events has overflowed
+ * or the group is shutting down.
  */
 int fsnotify_add_event(struct fsnotify_group *group,
 		       struct fsnotify_event *event,
@@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group,
 
 	mutex_lock(&group->notification_mutex);
 
+	if (group->shutdown) {
+		mutex_unlock(&group->notification_mutex);
+		return 2;
+	}
+
 	if (group->q_len >= group->max_events) {
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
@@ -126,21 +132,6 @@ queue:
 }
 
 /*
- * Remove @event from group's notification queue. It is the responsibility of
- * the caller to destroy the event.
- */
-void fsnotify_remove_event(struct fsnotify_group *group,
-			   struct fsnotify_event *event)
-{
-	mutex_lock(&group->notification_mutex);
-	if (!list_empty(&event->list)) {
-		list_del_init(&event->list);
-		group->q_len--;
-	}
-	mutex_unlock(&group->notification_mutex);
-}
-
-/*
  * Remove and return the first event from the notification list.  It is the
  * responsibility of the caller to destroy the obtained event
  */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..30bb10034120 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -5,11 +5,16 @@
 #include <linux/magic.h>
 #include <linux/ktime.h>
 #include <linux/seq_file.h>
+#include <linux/user_namespace.h>
+#include <linux/nsfs.h>
 
 static struct vfsmount *nsfs_mnt;
 
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+			unsigned long arg);
 static const struct file_operations ns_file_operations = {
 	.llseek		= no_llseek,
+	.unlocked_ioctl = ns_ioctl,
 };
 
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-void *ns_get_path(struct path *path, struct task_struct *task,
-			const struct proc_ns_operations *ns_ops)
+static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
-	struct vfsmount *mnt = mntget(nsfs_mnt);
+	struct vfsmount *mnt = nsfs_mnt;
 	struct qstr qname = { .name = "", };
 	struct dentry *dentry;
 	struct inode *inode;
-	struct ns_common *ns;
 	unsigned long d;
 
-again:
-	ns = ns_ops->get(task);
-	if (!ns) {
-		mntput(mnt);
-		return ERR_PTR(-ENOENT);
-	}
 	rcu_read_lock();
 	d = atomic_long_read(&ns->stashed);
 	if (!d)
@@ -68,17 +65,16 @@ again:
 	if (!lockref_get_not_dead(&dentry->d_lockref))
 		goto slow;
 	rcu_read_unlock();
-	ns_ops->put(ns);
+	ns->ops->put(ns);
 got_it:
-	path->mnt = mnt;
+	path->mnt = mntget(mnt);
 	path->dentry = dentry;
 	return NULL;
 slow:
 	rcu_read_unlock();
 	inode = new_inode_pseudo(mnt->mnt_sb);
 	if (!inode) {
-		ns_ops->put(ns);
-		mntput(mnt);
+		ns->ops->put(ns);
 		return ERR_PTR(-ENOMEM);
 	}
 	inode->i_ino = ns->inum;
@@ -91,21 +87,96 @@ slow:
 	dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
 	if (!dentry) {
 		iput(inode);
-		mntput(mnt);
 		return ERR_PTR(-ENOMEM);
 	}
 	d_instantiate(dentry, inode);
-	dentry->d_fsdata = (void *)ns_ops;
+	dentry->d_fsdata = (void *)ns->ops;
 	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
 	if (d) {
 		d_delete(dentry);	/* make sure ->d_prune() does nothing */
 		dput(dentry);
 		cpu_relax();
-		goto again;
+		return ERR_PTR(-EAGAIN);
 	}
 	goto got_it;
 }
 
+void *ns_get_path(struct path *path, struct task_struct *task,
+			const struct proc_ns_operations *ns_ops)
+{
+	struct ns_common *ns;
+	void *ret;
+
+again:
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	ret = __ns_get_path(path, ns);
+	if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
+		goto again;
+	return ret;
+}
+
+static int open_related_ns(struct ns_common *ns,
+		   struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+	struct path path = {};
+	struct file *f;
+	void *err;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	while (1) {
+		struct ns_common *relative;
+
+		relative = get_ns(ns);
+		if (IS_ERR(relative)) {
+			put_unused_fd(fd);
+			return PTR_ERR(relative);
+		}
+
+		err = __ns_get_path(&path, relative);
+		if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
+			continue;
+		break;
+	}
+	if (IS_ERR(err)) {
+		put_unused_fd(fd);
+		return PTR_ERR(err);
+	}
+
+	f = dentry_open(&path, O_RDONLY, current_cred());
+	path_put(&path);
+	if (IS_ERR(f)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(f);
+	} else
+		fd_install(fd, f);
+
+	return fd;
+}
+
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+			unsigned long arg)
+{
+	struct ns_common *ns = get_proc_ns(file_inode(filp));
+
+	switch (ioctl) {
+	case NS_GET_USERNS:
+		return open_related_ns(ns, ns_get_owner);
+	case NS_GET_PARENT:
+		if (!ns->ops->get_parent)
+			return -EINVAL;
+		return open_related_ns(ns, ns->ops->get_parent);
+	default:
+		return -ENOTTY;
+	}
+}
+
 int ns_get_name(char *buf, size_t size, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops)
 {
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7dabbc31060e..f165f867f332 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5922,7 +5922,6 @@ bail:
 }
 
 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
-					 handle_t *handle,
 					 struct inode *data_alloc_inode,
 					 struct buffer_head *data_alloc_bh)
 {
@@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	struct ocfs2_truncate_log *tl;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	handle_t *handle;
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 	tl = &di->id2.i_dealloc;
 	i = le16_to_cpu(tl->tl_used) - 1;
 	while (i >= 0) {
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail;
+		}
+
 		/* Caller has given us at least enough credits to
 		 * update the truncate log dinode */
 		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
@@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 			}
 		}
 
-		status = ocfs2_extend_trans(handle,
-				OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+		ocfs2_commit_trans(osb, handle);
 		i--;
 	}
 
@@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
 	int status;
 	unsigned int num_to_flush;
-	handle_t *handle;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct inode *data_alloc_inode = NULL;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 		goto out_mutex;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out_unlock;
-	}
-
-	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+	status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
 					       data_alloc_bh);
 	if (status < 0)
 		mlog_errno(status);
 
-	ocfs2_commit_trans(osb, handle);
-
-out_unlock:
 	brelse(data_alloc_bh);
 	ocfs2_inode_unlock(data_alloc_inode, 1);
 
@@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
 		goto out_mutex;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	while (head) {
 		if (head->free_bg)
 			bg_blkno = head->free_bg;
 		else
 			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
 							      head->free_bit);
+		handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
 		trace_ocfs2_free_cached_blocks(
 		     (unsigned long long)head->free_blk, head->free_bit);
 
 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
 					       head->free_bit, bg_blkno, 1);
-		if (ret) {
+		if (ret)
 			mlog_errno(ret);
-			goto out_journal;
-		}
 
-		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_journal;
-		}
+		ocfs2_commit_trans(osb, handle);
 
 		tmp = head;
 		head = head->free_next;
 		kfree(tmp);
 	}
 
-out_journal:
-	ocfs2_commit_trans(osb, handle);
-
 out_unlock:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 98d36548153d..bbb4b3e5b4ff 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1842,6 +1842,16 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
+	/*
+	 * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
+	 * even in case of error here like ENOSPC and ENOMEM. So, we need
+	 * to unlock the target page manually to prevent deadlocks when
+	 * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
+	 * to VM code.
+	 */
+	if (wc->w_target_locked)
+		unlock_page(mmap_page);
+
 	ocfs2_free_write_ctxt(inode, wc);
 
 	if (data_ac) {
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 94b18369b1cc..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,9 +44,6 @@
  * version here in tcp_internal.h should not need to be bumped for
  * filesystem locking changes.
  *
- * New in version 12
- *	- Negotiate hb timeout when storage is down.
- *
  * New in version 11
  * 	- Negotiation of filesystem locking in the dlm join.
  *
@@ -78,7 +75,7 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 12ULL
+#define O2NET_PROTOCOL_VERSION 11ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index cdeafb4e7ed6..0bb128659d4b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 				  struct dlm_lock *lock, int flags, int type)
 {
 	enum dlm_status status;
-	u8 old_owner = res->owner;
 
 	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
 	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
-	lock->convert_pending = 0;
 	/* if it failed, move it back to granted queue.
 	 * if master returns DLM_NORMAL and then down before sending ast,
 	 * it may have already been moved to granted queue, reset to
@@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 		if (status != DLM_NOTQUEUED)
 			dlm_error(status);
 		dlm_revert_pending_convert(res, lock);
-	} else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
-			(old_owner != res->owner)) {
-		mlog(0, "res %.*s is in recovering or has been recovered.\n",
-				res->lockname.len, res->lockname.name);
+	} else if (!lock->convert_pending) {
+		mlog(0, "%s: res %.*s, owner died and lock has been moved back "
+				"to granted list, retry convert.\n",
+				dlm->name, res->lockname.len, res->lockname.name);
 		status = DLM_RECOVERING;
 	}
+
+	lock->convert_pending = 0;
 bail:
 	spin_unlock(&res->spinlock);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4e7b0dc22450..0b055bfb8e86 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 				       u64 start, u64 len)
 {
 	int ret = 0;
-	u64 tmpend, end = start + len;
+	u64 tmpend = 0;
+	u64 end = start + len;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	unsigned int csize = osb->s_clustersize;
 	handle_t *handle;
@@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 	}
 
 	/*
-	 * We want to get the byte offset of the end of the 1st cluster.
+	 * If start is on a cluster boundary and end is somewhere in another
+	 * cluster, we have not COWed the cluster starting at start, unless
+	 * end is also within the same cluster. So, in this case, we skip this
+	 * first call to ocfs2_zero_range_for_truncate() and move on
+	 * to the next one.
 	 */
-	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
-	if (tmpend > end)
-		tmpend = end;
+	if ((start & (csize - 1)) != 0) {
+		/*
+		 * We want to get the byte offset of the end of the 1st
+		 * cluster.
+		 */
+		tmpend = (u64)osb->s_clustersize +
+			(start & ~(osb->s_clustersize - 1));
+		if (tmpend > end)
+			tmpend = end;
 
-	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
-						 (unsigned long long)tmpend);
+		trace_ocfs2_zero_partial_clusters_range1(
+			(unsigned long long)start,
+			(unsigned long long)tmpend);
 
-	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
-	if (ret)
-		mlog_errno(ret);
+		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
+						    tmpend);
+		if (ret)
+			mlog_errno(ret);
+	}
 
 	if (tmpend < end) {
 		/*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ea47120a85ff..6ad3533940ba 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1199,14 +1199,24 @@ retry:
 			inode_unlock((*ac)->ac_inode);
 
 			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
-			if (ret == 1)
+			if (ret == 1) {
+				iput((*ac)->ac_inode);
+				(*ac)->ac_inode = NULL;
 				goto retry;
+			}
 
 			if (ret < 0)
 				mlog_errno(ret);
 
 			inode_lock((*ac)->ac_inode);
-			ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				inode_unlock((*ac)->ac_inode);
+				iput((*ac)->ac_inode);
+				(*ac)->ac_inode = NULL;
+				goto bail;
+			}
 		}
 		if (status < 0) {
 			if (status != -ENOSPC)
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 00235bf644dc..1e8fe844e69f 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
 		}
 	}
 
-	dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+	dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 	ret = 1;
 out_release_op:
 	op_release(new_op);
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index a287a66d94e3..516ffb4dc9a0 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -11,14 +11,19 @@
 #include "orangefs-kernel.h"
 #include "orangefs-dev-proto.h"
 #include "orangefs-bufmap.h"
+#include "orangefs-debugfs.h"
 
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 
 /* this file implements the /dev/pvfs2-req device node */
 
+uint32_t orangefs_userspace_version;
+
 static int open_access_count;
 
+static DEFINE_MUTEX(devreq_mutex);
+
 #define DUMP_DEVICE_ERROR()                                                   \
 do {                                                                          \
 	gossip_err("*****************************************************\n");\
@@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
 {
 	int index = hash_func(op->tag, hash_table_size);
 
-	list_add_tail(&op->list, &htable_ops_in_progress[index]);
+	list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]);
 }
 
 /*
@@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
 
 	index = hash_func(tag, hash_table_size);
 
-	spin_lock(&htable_ops_in_progress_lock);
+	spin_lock(&orangefs_htable_ops_in_progress_lock);
 	list_for_each_entry_safe(op,
 				 next,
-				 &htable_ops_in_progress[index],
+				 &orangefs_htable_ops_in_progress[index],
 				 list) {
 		if (op->tag == tag && !op_state_purged(op) &&
 		    !op_state_given_up(op)) {
 			list_del_init(&op->list);
-			spin_unlock(&htable_ops_in_progress_lock);
+			spin_unlock(&orangefs_htable_ops_in_progress_lock);
 			return op;
 		}
 	}
 
-	spin_unlock(&htable_ops_in_progress_lock);
+	spin_unlock(&orangefs_htable_ops_in_progress_lock);
 	return NULL;
 }
 
@@ -276,11 +281,11 @@ restart:
 	if (ret != 0)
 		goto error;
 
-	spin_lock(&htable_ops_in_progress_lock);
+	spin_lock(&orangefs_htable_ops_in_progress_lock);
 	spin_lock(&cur_op->lock);
 	if (unlikely(op_state_given_up(cur_op))) {
 		spin_unlock(&cur_op->lock);
-		spin_unlock(&htable_ops_in_progress_lock);
+		spin_unlock(&orangefs_htable_ops_in_progress_lock);
 		complete(&cur_op->waitq);
 		goto restart;
 	}
@@ -298,7 +303,7 @@ restart:
 		     current->comm);
 	orangefs_devreq_add_op(cur_op);
 	spin_unlock(&cur_op->lock);
-	spin_unlock(&htable_ops_in_progress_lock);
+	spin_unlock(&orangefs_htable_ops_in_progress_lock);
 
 	/* The client only asks to read one size buffer. */
 	return MAX_DEV_REQ_UPSIZE;
@@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 		return -EPROTO;
 	}
 
+	if (!orangefs_userspace_version) {
+		orangefs_userspace_version = head.version;
+	} else if (orangefs_userspace_version != head.version) {
+		gossip_err("Error: userspace version changes\n");
+		return -EPROTO;
+	}
+
 	/* remove the op from the in progress hash table */
 	op = orangefs_devreq_remove_op(head.tag);
 	if (!op) {
@@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file)
 	gossip_debug(GOSSIP_DEV_DEBUG,
 		     "pvfs2-client-core: device close complete\n");
 	open_access_count = 0;
+	orangefs_userspace_version = 0;
 	mutex_unlock(&devreq_mutex);
 	return 0;
 }
@@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
 	static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
 	struct ORANGEFS_dev_map_desc user_desc;
 	int ret = 0;
-	struct dev_mask_info_s mask_info = { 0 };
-	struct dev_mask2_info_s mask2_info = { 0, 0 };
 	int upstream_kmod = 1;
 	struct orangefs_sb_info_s *orangefs_sb;
 
@@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
 		 * all of the remounts are serviced (to avoid ops between
 		 * mounts to fail)
 		 */
-		ret = mutex_lock_interruptible(&request_mutex);
+		ret = mutex_lock_interruptible(&orangefs_request_mutex);
 		if (ret < 0)
 			return ret;
 		gossip_debug(GOSSIP_DEV_DEBUG,
@@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
 		gossip_debug(GOSSIP_DEV_DEBUG,
 			     "%s: priority remount complete\n",
 			     __func__);
-		mutex_unlock(&request_mutex);
+		mutex_unlock(&orangefs_request_mutex);
 		return ret;
 
 	case ORANGEFS_DEV_UPSTREAM:
@@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
 			return ret;
 
 	case ORANGEFS_DEV_CLIENT_MASK:
-		ret = copy_from_user(&mask2_info,
-				     (void __user *)arg,
-				     sizeof(struct dev_mask2_info_s));
-
-		if (ret != 0)
-			return -EIO;
-
-		client_debug_mask.mask1 = mask2_info.mask1_value;
-		client_debug_mask.mask2 = mask2_info.mask2_value;
-
-		pr_info("%s: client debug mask has been been received "
-			":%llx: :%llx:\n",
-			__func__,
-			(unsigned long long)client_debug_mask.mask1,
-			(unsigned long long)client_debug_mask.mask2);
-
-		return ret;
-
+		return orangefs_debugfs_new_client_mask((void __user *)arg);
 	case ORANGEFS_DEV_CLIENT_STRING:
-		ret = copy_from_user(&client_debug_array_string,
-				     (void __user *)arg,
-				     ORANGEFS_MAX_DEBUG_STRING_LEN);
-		/*
-		 * The real client-core makes an effort to ensure
-		 * that actual strings that aren't too long to fit in
-		 * this buffer is what we get here. We're going to use
-		 * string functions on the stuff we got, so we'll make
-		 * this extra effort to try and keep from
-		 * flowing out of this buffer when we use the string
-		 * functions, even if somehow the stuff we end up
-		 * with here is garbage.
-		 */
-		client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
-			'\0';
-		
-		if (ret != 0) {
-			pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
-				__func__);
-			return -EIO;
-		}
-
-		pr_info("%s: client debug array string has been received.\n",
-			__func__);
-
-		if (!help_string_initialized) {
-
-			/* Free the "we don't know yet" default string... */
-			kfree(debug_help_string);
-
-			/* build a proper debug help string */
-			if (orangefs_prepare_debugfs_help_string(0)) {
-				gossip_err("%s: no debug help string \n",
-					   __func__);
-				return -EIO;
-			}
-
-			/* Replace the boilerplate boot-time debug-help file. */
-			debugfs_remove(help_file_dentry);
-
-			help_file_dentry =
-				debugfs_create_file(
-					ORANGEFS_KMOD_DEBUG_HELP_FILE,
-					0444,
-					debug_dir,
-					debug_help_string,
-					&debug_help_fops);
-
-			if (!help_file_dentry) {
-				gossip_err("%s: debugfs_create_file failed for"
-					   " :%s:!\n",
-					   __func__,
-					   ORANGEFS_KMOD_DEBUG_HELP_FILE);
-				return -EIO;
-			}
-		}
-
-		debug_mask_to_string(&client_debug_mask, 1);
-
-		debugfs_remove(client_debug_dentry);
-
-		orangefs_client_debug_init();
-
-		help_string_initialized++;
-
-		return ret;
-
+		return orangefs_debugfs_new_client_string((void __user *)arg);
 	case ORANGEFS_DEV_DEBUG:
-		ret = copy_from_user(&mask_info,
-				     (void __user *)arg,
-				     sizeof(mask_info));
-
-		if (ret != 0)
-			return -EIO;
-
-		if (mask_info.mask_type == KERNEL_MASK) {
-			if ((mask_info.mask_value == 0)
-			    && (kernel_mask_set_mod_init)) {
-				/*
-				 * the kernel debug mask was set when the
-				 * kernel module was loaded; don't override
-				 * it if the client-core was started without
-				 * a value for ORANGEFS_KMODMASK.
-				 */
-				return 0;
-			}
-			debug_mask_to_string(&mask_info.mask_value,
-					     mask_info.mask_type);
-			gossip_debug_mask = mask_info.mask_value;
-			pr_info("%s: kernel debug mask has been modified to "
-				":%s: :%llx:\n",
-				__func__,
-				kernel_debug_string,
-				(unsigned long long)gossip_debug_mask);
-		} else if (mask_info.mask_type == CLIENT_MASK) {
-			debug_mask_to_string(&mask_info.mask_value,
-					     mask_info.mask_type);
-			pr_info("%s: client debug mask has been modified to"
-				":%s: :%llx:\n",
-				__func__,
-				client_debug_string,
-				llu(mask_info.mask_value));
-		} else {
-			gossip_lerr("Invalid mask type....\n");
-			return -EINVAL;
-		}
-
-		return ret;
-
+		return orangefs_debugfs_new_debug((void __user *)arg);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 66b99210f1f9..3b8923f8bf21 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -83,7 +83,10 @@ struct orangefs_listxattr_response {
 };
 
 struct orangefs_param_response {
-	__s64 value;
+	union {
+		__s64 value64;
+		__s32 value32[2];
+	} u;
 };
 
 #define PERF_COUNT_BUF_SIZE 4096
@@ -98,6 +101,11 @@ struct orangefs_fs_key_response {
 	char fs_key[FS_KEY_BUF_SIZE];
 };
 
+/* 2.9.6 */
+struct orangefs_features_response {
+	__u64 features;
+};
+
 struct orangefs_downcall_s {
 	__s32 type;
 	__s32 status;
@@ -119,6 +127,7 @@ struct orangefs_downcall_s {
 		struct orangefs_param_response param;
 		struct orangefs_perf_count_response perf_count;
 		struct orangefs_fs_key_response fs_key;
+		struct orangefs_features_response features;
 	} resp;
 };
 
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 526040e09f78..3386886596d6 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -14,6 +14,32 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 
+static int flush_racache(struct inode *inode)
+{
+	/* Send an RA_FLUSH upcall for inode; returns its status or -ENOMEM. */
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *op;
+	int rc;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+	    "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
+	    get_khandle_from_ino(inode), &oi->refn.khandle, oi->refn.fs_id);
+
+	op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
+	if (!op)
+		return -ENOMEM;
+
+	/* The upcall names its target object by the inode's refn. */
+	op->upcall.req.ra_cache_flush.refn = oi->refn;
+	rc = service_operation(op, "orangefs_flush_racache",
+	    get_interruptible_flag(inode));
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
+	    __func__, rc);
+
+	op_release(op);
+	return rc;
+}
+
 /*
  * Copy to client-core's address space from the buffers specified
  * by the iovec upto total_size bytes.
@@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode,
 	size_t bufmap_size;
 	ssize_t ret = -EINVAL;
 
-	g_orangefs_stats.reads++;
+	orangefs_stats.reads++;
 
 	bufmap_size = orangefs_bufmap_size_query();
 	if (count > bufmap_size) {
@@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
 
 	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
 
-	g_orangefs_stats.reads++;
+	orangefs_stats.reads++;
 
 	rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
 	iocb->ki_pos = pos;
@@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
 	}
 
 	iocb->ki_pos = pos;
-	g_orangefs_stats.writes++;
+	orangefs_stats.writes++;
 
 out:
 
@@ -591,15 +617,24 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
 	orangefs_flush_inode(inode);
 
 	/*
-	 * remove all associated inode pages from the page cache and mmap
+	 * remove all associated inode pages from the page cache and
 	 * readahead cache (if any); this forces an expensive refresh of
 	 * data for the next caller of mmap (or 'get_block' accesses)
 	 */
 	if (file->f_path.dentry->d_inode &&
 	    file->f_path.dentry->d_inode->i_mapping &&
-	    mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
+	    mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) {
+		if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
+			gossip_debug(GOSSIP_INODE_DEBUG,
+			    "calling flush_racache on %pU\n",
+			    get_khandle_from_ino(inode));
+			flush_racache(inode);
+			gossip_debug(GOSSIP_INODE_DEBUG,
+			    "flush_racache finished\n");
+		}
 		truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
 				     0);
+	}
 	return 0;
 }
 
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 62c525936ee8..35269e31de92 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir,
 
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
-	dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+	dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
@@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
 		goto out;
 	}
 
-	dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+	dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 
 	inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
 	if (IS_ERR(inode)) {
@@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir,
 
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
-	dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+	dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
@@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
-	dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+	dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index b6edbe9fb309..aa3830b741c7 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
 			return "OP_STATFS";
 		else if (type == ORANGEFS_VFS_OP_TRUNCATE)
 			return "OP_TRUNCATE";
-		else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
-			return "OP_MMAP_RA_FLUSH";
+		else if (type == ORANGEFS_VFS_OP_RA_FLUSH)
+			return "OP_RA_FLUSH";
 		else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
 			return "OP_FS_MOUNT";
 		else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
@@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
 			return "OP_FSYNC";
 		else if (type == ORANGEFS_VFS_OP_FSKEY)
 			return "OP_FSKEY";
+		else if (type == ORANGEFS_VFS_OP_FEATURES)
+			return "OP_FEATURES";
 	}
 	return "OP_UNKNOWN?";
 }
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 1714a737d556..9b24107c82a8 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -43,36 +43,35 @@
 #include "protocol.h"
 #include "orangefs-kernel.h"
 
-static int orangefs_debug_disabled = 1;
-
-static int orangefs_debug_help_open(struct inode *, struct file *);
+#define DEBUG_HELP_STRING_SIZE 4096
+#define HELP_STRING_UNINITIALIZED \
+	"Client Debug Keywords are unknown until the first time\n" \
+	"the client is started after boot.\n"
+#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
+#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
+#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
+#define ORANGEFS_VERBOSE "verbose"
+#define ORANGEFS_ALL "all"
 
-const struct file_operations debug_help_fops = {
-	.open           = orangefs_debug_help_open,
-	.read           = seq_read,
-	.release        = seq_release,
-	.llseek         = seq_lseek,
+/*
+ * An array of client_debug_mask will be built to hold debug keyword/mask
+ * values fetched from userspace.
+ */
+struct client_debug_mask {
+	char *keyword;
+	__u64 mask1;
+	__u64 mask2;
 };
 
+static int orangefs_kernel_debug_init(void);
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
 static void *help_start(struct seq_file *, loff_t *);
 static void *help_next(struct seq_file *, void *, loff_t *);
 static void help_stop(struct seq_file *, void *);
 static int help_show(struct seq_file *, void *);
 
-static const struct seq_operations help_debug_ops = {
-	.start	= help_start,
-	.next	= help_next,
-	.stop	= help_stop,
-	.show	= help_show,
-};
-
-/*
- * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
- * ORANGEFS_KMOD_DEBUG_FILE.
- */
-static DEFINE_MUTEX(orangefs_debug_lock);
-
-int orangefs_debug_open(struct inode *, struct file *);
+static int orangefs_debug_open(struct inode *, struct file *);
 
 static ssize_t orangefs_debug_read(struct file *,
 				 char __user *,
@@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *,
 				  size_t,
 				  loff_t *);
 
+static int orangefs_prepare_cdm_array(char *);
+static void debug_mask_to_string(void *, int);
+static void do_k_string(void *, int);
+static void do_c_string(void *, int);
+static int keyword_is_amalgam(char *);
+static int check_amalgam_keyword(void *, int);
+static void debug_string_to_mask(char *, void *, int);
+static void do_c_mask(int, char *, struct client_debug_mask **);
+static void do_k_mask(int, char *, __u64 **);
+
+static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
+static char *debug_help_string;
+static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+
+static struct dentry *help_file_dentry;
+static struct dentry *client_debug_dentry;
+static struct dentry *debug_dir;
+
+static unsigned int kernel_mask_set_mod_init;
+static int orangefs_debug_disabled = 1;
+static int help_string_initialized;
+
+static const struct seq_operations help_debug_ops = {
+	.start	= help_start,
+	.next	= help_next,
+	.stop	= help_stop,
+	.show	= help_show,
+};
+
+const struct file_operations debug_help_fops = {
+	.open           = orangefs_debug_help_open,
+	.read           = seq_read,
+	.release        = seq_release,
+	.llseek         = seq_lseek,
+};
+
 static const struct file_operations kernel_debug_fops = {
 	.open           = orangefs_debug_open,
 	.read           = orangefs_debug_read,
@@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = {
 	.llseek         = generic_file_llseek,
 };
 
+static int client_all_index;
+static int client_verbose_index;
+
+static struct client_debug_mask *cdm_array;
+static int cdm_element_count;
+
+static struct client_debug_mask client_debug_mask;
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_CLIENT_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
 /*
  * initialize kmod debug operations, create orangefs debugfs dir and
  * ORANGEFS_KMOD_DEBUG_HELP_FILE.
  */
-int orangefs_debugfs_init(void)
+int orangefs_debugfs_init(int debug_mask)
 {
-
 	int rc = -ENOMEM;
 
+	/* convert input debug mask to a 64-bit unsigned integer */
+        orangefs_gossip_debug_mask = (unsigned long long)debug_mask;
+
+	/*
+	 * set the kernel's gossip debug string; invalid mask values will
+	 * be ignored.
+	 */
+	debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
+
+	/* remove any invalid values from the mask */
+	debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask,
+	    0);
+
+	/*
+	 * if the mask has a non-zero value, then indicate that the mask
+	 * was set when the kernel module was loaded.  The orangefs dev ioctl
+	 * command will look at this boolean to determine if the kernel's
+	 * debug mask should be overwritten when the client-core is started.
+	 */
+	if (orangefs_gossip_debug_mask != 0)
+		kernel_mask_set_mod_init = true;
+
+	pr_info("%s: called with debug mask: :%s: :%llx:\n",
+		__func__,
+		kernel_debug_string,
+		(unsigned long long)orangefs_gossip_debug_mask);
+
 	debug_dir = debugfs_create_dir("orangefs", NULL);
 	if (!debug_dir) {
 		pr_info("%s: debugfs_create_dir failed.\n", __func__);
@@ -117,13 +193,58 @@ int orangefs_debugfs_init(void)
 	}
 
 	orangefs_debug_disabled = 0;
+
+	rc = orangefs_kernel_debug_init();
+
+out:
+
+	return rc;
+}
+
+/*
+ * Create ORANGEFS_KMOD_DEBUG_FILE in debug_dir.
+ * Returns 0 on success, -ENOMEM on allocation or file-creation failure.
+ */
+static int orangefs_kernel_debug_init(void)
+{
+	int rc = -ENOMEM;
+	struct dentry *ret;
+	char *k_buffer = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+	/* Handed to debugfs_create_file() below as the file's data pointer. */
+	k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!k_buffer)
+		goto out;
+
+	if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		strcpy(k_buffer, kernel_debug_string);
+		strcat(k_buffer, "\n");
+	} else {
+		strcpy(k_buffer, "none\n");
+		pr_info("%s: overflow 1!\n", __func__);
+	}
+
+	ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir,
+				  k_buffer, &kernel_debug_fops);
+	if (!ret) {
+		pr_info("%s: failed to create %s.\n",
+			__func__, ORANGEFS_KMOD_DEBUG_FILE);
+		/* No file took ownership of the buffer; don't leak it. */
+		kfree(k_buffer);
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+	return rc;
+}
 
+
 void orangefs_debugfs_cleanup(void)
 {
 	debugfs_remove_recursive(debug_dir);
@@ -196,49 +317,6 @@ static int help_show(struct seq_file *m, void *v)
 }
 
 /*
- * initialize the kernel-debug file.
- */
-int orangefs_kernel_debug_init(void)
-{
-	int rc = -ENOMEM;
-	struct dentry *ret;
-	char *k_buffer = NULL;
-
-	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
-
-	k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
-	if (!k_buffer)
-		goto out;
-
-	if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
-		strcpy(k_buffer, kernel_debug_string);
-		strcat(k_buffer, "\n");
-	} else {
-		strcpy(k_buffer, "none\n");
-		pr_info("%s: overflow 1!\n", __func__);
-	}
-
-	ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
-				  0444,
-				  debug_dir,
-				  k_buffer,
-				  &kernel_debug_fops);
-	if (!ret) {
-		pr_info("%s: failed to create %s.\n",
-			__func__,
-			ORANGEFS_KMOD_DEBUG_FILE);
-		goto out;
-	}
-
-	rc = 0;
-
-out:
-
-	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-	return rc;
-}
-
-/*
  * initialize the client-debug file.
  */
 int orangefs_client_debug_init(void)
@@ -282,7 +360,7 @@ out:
 }
 
 /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
-int orangefs_debug_open(struct inode *inode, struct file *file)
+static int orangefs_debug_open(struct inode *inode, struct file *file)
 {
 	int rc = -ENODEV;
 
@@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file,
 	 */
 	if (!strcmp(file->f_path.dentry->d_name.name,
 		    ORANGEFS_KMOD_DEBUG_FILE)) {
-		debug_string_to_mask(buf, &gossip_debug_mask, 0);
-		debug_mask_to_string(&gossip_debug_mask, 0);
+		debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0);
+		debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
 		debug_string = kernel_debug_string;
 		gossip_debug(GOSSIP_DEBUGFS_DEBUG,
 			     "New kernel debug string is %s\n",
@@ -452,3 +530,546 @@ out:
 	kfree(buf);
 	return rc;
 }
+
+/*
+ * Build cdm_array from the client's "keyword mask1 mask2\n" records in
+ * debug_array_string (the string is modified: each '\n' becomes '\0').
+ * Returns the number of elements, -EINVAL if none, -ENOMEM on failure.
+ */
+static int orangefs_prepare_cdm_array(char *debug_array_string)
+{
+	int i;
+	int rc = -EINVAL;
+	char *cds_head = NULL;
+	char *cds_delimiter = NULL;
+	int keyword_len = 0;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	/*
+	 * Count the '\n'-terminated records, starting from zero so that a
+	 * stale count left by an earlier call cannot be carried over.
+	 */
+	cdm_element_count = 0;
+	for (cds_head = debug_array_string; *cds_head; cds_head++)
+		if (*cds_head == '\n')
+			cdm_element_count++;
+
+	if (!cdm_element_count) {
+		pr_info("No elements in client debug array string!\n");
+		goto out;
+	}
+
+	cdm_array = kcalloc(cdm_element_count, sizeof(*cdm_array), GFP_KERNEL);
+	if (!cdm_array) {
+		pr_info("malloc failed for cdm_array!\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cds_head = debug_array_string;
+
+	for (i = 0; i < cdm_element_count; i++) {
+		cds_delimiter = strchr(cds_head, '\n');
+		*cds_delimiter = '\0';
+
+		/*
+		 * "%s" copies one whitespace-delimited token, which can be
+		 * longer than the text before the first ' ' (leading blanks),
+		 * so size the buffer for the whole record.
+		 */
+		keyword_len = strlen(cds_head);
+		cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
+		if (!cdm_array[i].keyword) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sscanf(cds_head, "%s %llx %llx", cdm_array[i].keyword,
+		       (unsigned long long *)&(cdm_array[i].mask1),
+		       (unsigned long long *)&(cdm_array[i].mask2));
+
+		if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
+			client_verbose_index = i;
+
+		if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
+			client_all_index = i;
+
+		cds_head = cds_delimiter + 1;
+	}
+
+	rc = cdm_element_count;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+
+out:
+	return rc;
+}
+
+/*
+ * /sys/kernel/debug/orangefs/debug-help can be catted to
+ * see all the available kernel and client debug keywords.
+ *
+ * When the kernel boots, we have no idea what keywords the
+ * client supports, nor their associated masks, so the at_boot pass
+ * stamps a boilerplate "we don't know" message for the client. We
+ * pass through here again when the client starts and then we can
+ * fill out the debug-help file fully. The client might be restarted
+ * any number of times between reboots, we only build the debug-help
+ * file the first time.
+ *
+ * Returns 0 with debug_help_string freshly allocated, -ENOMEM if the
+ * allocation fails, -EINVAL on overflow or an unusable client string.
+ */
+int orangefs_prepare_debugfs_help_string(int at_boot)
+{
+	int rc = -EINVAL;
+	int i;
+	int byte_count = 0;
+	char *client_title = "Client Debug Keywords:\n";
+	char *kernel_title = "Kernel Debug Keywords:\n";
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (at_boot) {
+		byte_count += strlen(HELP_STRING_UNINITIALIZED);
+		client_title = HELP_STRING_UNINITIALIZED;
+	} else {
+		/*
+		 * fill the client keyword/mask array and remember
+		 * how many elements there were.
+		 */
+		cdm_element_count =
+			orangefs_prepare_cdm_array(client_debug_array_string);
+		if (cdm_element_count <= 0)
+			goto out;
+
+		/* Count the bytes destined for debug_help_string. */
+		byte_count += strlen(client_title);
+
+		for (i = 0; i < cdm_element_count; i++) {
+			/* each keyword is emitted as "\t" keyword "\n" */
+			byte_count += strlen(cdm_array[i].keyword) + 2;
+			if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+				pr_info("%s: overflow 1!\n", __func__);
+				goto out;
+			}
+		}
+
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: cdm_element_count:%d:\n",
+			     __func__,
+			     cdm_element_count);
+	}
+
+	/* one byte for the "\n" that separates the two sections */
+	byte_count += 1 + strlen(kernel_title);
+	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+		byte_count +=
+			strlen(s_kmod_keyword_mask_map[i].keyword) + 2;
+		if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+			pr_info("%s: overflow 2!\n", __func__);
+			goto out;
+		}
+	}
+
+	/* build debug_help_string. */
+	debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+	if (!debug_help_string) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	strcat(debug_help_string, client_title);
+
+	if (!at_boot) {
+		for (i = 0; i < cdm_element_count; i++) {
+			strcat(debug_help_string, "\t");
+			strcat(debug_help_string, cdm_array[i].keyword);
+			strcat(debug_help_string, "\n");
+		}
+	}
+
+	strcat(debug_help_string, "\n");
+	strcat(debug_help_string, kernel_title);
+
+	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+		strcat(debug_help_string, "\t");
+		strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
+		strcat(debug_help_string, "\n");
+	}
+
+	rc = 0;
+
+out:
+	return rc;
+}
+
+/*
+ * Rebuild the debug string that corresponds to *mask.
+ *
+ * type 0: mask is a __u64 kernel mask; kernel_debug_string is rewritten.
+ * type 1: mask is a struct client_debug_mask; client_debug_string is
+ *         rewritten.
+ *
+ * The result is an amalgam keyword when check_amalgam_keyword() finds one,
+ * otherwise a comma-separated keyword list, or "none" when nothing is set.
+ */
+static void debug_mask_to_string(void *mask, int type)
+{
+	char *debug_string;
+	int element_count;
+	int len;
+	int i;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (type) {
+		debug_string = client_debug_string;
+		element_count = cdm_element_count;
+	} else {
+		debug_string = kernel_debug_string;
+		element_count = num_kmod_keyword_mask_map;
+	}
+
+	memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+
+	/*
+	 * "all" and "verbose" are amalgams of numerous other keywords;
+	 * if the mask matches one of them there is no need to grind
+	 * through the individual keywords.
+	 */
+	if (check_amalgam_keyword(mask, type))
+		goto out;
+
+	/* Let the per-type helper append "keyword," for each element. */
+	for (i = 0; i < element_count; i++) {
+		if (type)
+			do_c_string(mask, i);
+		else
+			do_k_string(mask, i);
+	}
+
+	/* Drop the last character (the trailing ','), or use "none". */
+	len = strlen(debug_string);
+	if (len)
+		debug_string[len - 1] = '\0';
+	else
+		strcpy(debug_string, "none");
+
+out:
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n",
+		     __func__, debug_string);
+}
+
+static void do_k_string(void *k_mask, int index)
+{
+	/* Append "keyword," to kernel_debug_string if its mask bits are set. */
+	const char *kw = s_kmod_keyword_mask_map[index].keyword;
+	__u64 *mask = k_mask;
+
+	/* Amalgam keywords are never listed individually. */
+	if (keyword_is_amalgam((char *)kw))
+		return;
+
+	if (!(*mask & s_kmod_keyword_mask_map[index].mask_val))
+		return;
+
+	/* Keep room for the keyword, the ',' and the terminating NUL. */
+	if (strlen(kernel_debug_string) + strlen(kw) >=
+	    ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+		/* No room left: fall back to the "all" keyword. */
+		gossip_err("%s: overflow!\n", __func__);
+		strcpy(kernel_debug_string, ORANGEFS_ALL);
+		return;
+	}
+
+	strcat(kernel_debug_string, kw);
+	strcat(kernel_debug_string, ",");
+}
+
+static void do_c_string(void *c_mask, int index)
+{
+	/* Append "keyword," to client_debug_string if its mask bits are set. */
+	struct client_debug_mask *mask = c_mask;
+	char *kw = cdm_array[index].keyword;
+
+	if (keyword_is_amalgam(kw))
+		return;
+
+	if (!(mask->mask1 & cdm_array[index].mask1) &&
+	    !(mask->mask2 & cdm_array[index].mask2))
+		return;
+
+	if (strlen(client_debug_string) + strlen(kw) + 1 >=
+	    ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+		/* No room left: fall back to the "all" keyword. */
+		gossip_err("%s: overflow!\n", __func__);
+		strcpy(client_debug_string, ORANGEFS_ALL);
+		return;
+	}
+
+	strcat(client_debug_string, kw);
+	strcat(client_debug_string, ",");
+}
+
+static int keyword_is_amalgam(char *keyword)
+{
+	/* Returns 1 for the ORANGEFS_ALL / ORANGEFS_VERBOSE keywords, else 0. */
+	if (!strcmp(keyword, ORANGEFS_ALL))
+		return 1;
+	if (!strcmp(keyword, ORANGEFS_VERBOSE))
+		return 1;
+	return 0;
+}
+
+/*
+ * Check whether *mask is one of the amalgam ("all"/"verbose") masks.
+ *
+ * kernel = type 0: mask points to a __u64
+ * client = type 1: mask points to a struct client_debug_mask
+ *
+ * return 1 if we found an amalgam, after copying its keyword into the
+ * debug string for that type; return 0 otherwise and leave the string
+ * untouched.
+ */
+static int check_amalgam_keyword(void *mask, int type)
+{
+	const struct client_debug_mask *c_mask = mask;
+	const struct client_debug_mask *amalgam;
+	const __u64 *k_mask = mask;
+	int k_all_index = num_kmod_keyword_mask_map - 1;
+
+	if (!type) {
+		/* The last kernel map entry is used as the "all" mask. */
+		if (*k_mask < s_kmod_keyword_mask_map[k_all_index].mask_val)
+			return 0;
+
+		strcpy(kernel_debug_string, ORANGEFS_ALL);
+		return 1;
+	}
+
+	/* Client masks must match both words exactly; "all" is tried first. */
+	amalgam = &cdm_array[client_all_index];
+	if (c_mask->mask1 == amalgam->mask1 &&
+	    c_mask->mask2 == amalgam->mask2) {
+		strcpy(client_debug_string, ORANGEFS_ALL);
+		return 1;
+	}
+
+	/* ... then "verbose". */
+	amalgam = &cdm_array[client_verbose_index];
+	if (c_mask->mask1 == amalgam->mask1 &&
+	    c_mask->mask2 == amalgam->mask2) {
+		strcpy(client_debug_string, ORANGEFS_VERBOSE);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * OR the masks of the keywords in comma-separated debug_string into *mask.
+ * kernel = type 0 (__u64, cleared first)
+ * client = type 1 (struct client_debug_mask, not cleared)
+ */
+static void debug_string_to_mask(char *debug_string, void *mask, int type)
+{
+	struct client_debug_mask *c_mask = NULL;
+	__u64 *k_mask = NULL;
+	int element_count;
+	char *scratch = kstrdup(debug_string, GFP_KERNEL);
+	char *cursor = scratch;
+	char *token;
+	int i;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (type) {
+		c_mask = (struct client_debug_mask *)mask;
+		element_count = cdm_element_count;
+	} else {
+		k_mask = (__u64 *)mask;
+		*k_mask = 0;
+		element_count = num_kmod_keyword_mask_map;
+	}
+
+	/* strsep() advances cursor; scratch keeps the address to free. */
+	while ((token = strsep(&cursor, ","))) {
+		if (!strlen(token))
+			continue;
+
+		for (i = 0; i < element_count; i++) {
+			if (type)
+				do_c_mask(i, token, &c_mask);
+			else
+				do_k_mask(i, token, &k_mask);
+		}
+	}
+
+	kfree(scratch);
+}
+
+static void do_c_mask(int i, char *unchecked_keyword,
+    struct client_debug_mask **sane_mask)
+{
+	/* OR in element i's masks when its keyword matches exactly. */
+	if (strcmp(cdm_array[i].keyword, unchecked_keyword) != 0)
+		return;
+	(*sane_mask)->mask1 |= cdm_array[i].mask1;
+	(*sane_mask)->mask2 |= cdm_array[i].mask2;
+}
+
+static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
+{
+	/* OR in entry i's mask value when its keyword matches. */
+	if (strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
+		return;
+	**sane_mask |= s_kmod_keyword_mask_map[i].mask_val;
+}
+
+/*
+ * Copy a dev_mask2_info_s from userspace and store its two mask values
+ * in client_debug_mask.  Returns 0 on success, -EIO if the copy fails.
+ */
+int orangefs_debugfs_new_client_mask(void __user *arg)
+{
+	struct dev_mask2_info_s mask2_info = {0};
+
+	/* copy_from_user() returns the number of bytes left uncopied. */
+	if (copy_from_user(&mask2_info, arg, sizeof(mask2_info)))
+		return -EIO;
+
+	client_debug_mask.mask1 = mask2_info.mask1_value;
+	client_debug_mask.mask2 = mask2_info.mask2_value;
+
+	pr_info("%s: client debug mask has been received "
+		":%llx: :%llx:\n",
+		__func__,
+		(unsigned long long)client_debug_mask.mask1,
+		(unsigned long long)client_debug_mask.mask2);
+
+	return 0;
+}
+
+/*
+ * Receive the client debug keyword string from userspace.  The first
+ * time through, replace the boot-time debugfs help file with a newly
+ * built one.  Always remove the old client debug dentry and call
+ * orangefs_client_debug_init().  Returns 0 on success, else -EIO.
+ */
+int orangefs_debugfs_new_client_string(void __user *arg)
+{
+	int ret;
+
+	ret = copy_from_user(&client_debug_array_string,
+			     (void __user *)arg,
+			     ORANGEFS_MAX_DEBUG_STRING_LEN);
+	if (ret != 0) {
+		pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
+			__func__);
+		return -EIO;
+	}
+
+	/*
+	 * The real client-core tries to make sure that what we get
+	 * here is a proper string that fits in this buffer. We are
+	 * about to use string functions on it, so force termination
+	 * to keep those functions from running off the end of the
+	 * buffer even if what we were handed is garbage.
+	 */
+	client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
+		'\0';
+
+	pr_info("%s: client debug array string has been received.\n",
+		__func__);
+
+	if (!help_string_initialized) {
+
+		/* Free the "we don't know yet" default string... */
+		kfree(debug_help_string);
+
+		/* build a proper debug help string */
+		if (orangefs_prepare_debugfs_help_string(0)) {
+			gossip_err("%s: no debug help string \n",
+				   __func__);
+			return -EIO;
+		}
+
+		/* Replace the boilerplate boot-time debug-help file. */
+		debugfs_remove(help_file_dentry);
+
+		help_file_dentry =
+			debugfs_create_file(
+				ORANGEFS_KMOD_DEBUG_HELP_FILE,
+				0444,
+				debug_dir,
+				debug_help_string,
+				&debug_help_fops);
+
+		if (!help_file_dentry) {
+			gossip_err("%s: debugfs_create_file failed for"
+				   " :%s:!\n",
+				   __func__,
+				   ORANGEFS_KMOD_DEBUG_HELP_FILE);
+			return -EIO;
+		}
+	}
+
+	debug_mask_to_string(&client_debug_mask, 1);
+
+	debugfs_remove(client_debug_dentry);
+
+	orangefs_client_debug_init();
+
+	help_string_initialized++;
+
+	return 0;
+}
+
+/*
+ * Apply a debug mask copied in from userspace.  KERNEL_MASK updates
+ * orangefs_gossip_debug_mask; CLIENT_MASK is just stringified and logged.
+ * Returns 0, -EIO if the copy fails, or -EINVAL for a bad mask type.
+ */
+int orangefs_debugfs_new_debug(void __user *arg)
+{
+	struct dev_mask_info_s mask_info = {0};
+
+	if (copy_from_user(&mask_info, arg, sizeof(mask_info)))
+		return -EIO;
+
+	if (mask_info.mask_type == KERNEL_MASK) {
+		if ((mask_info.mask_value == 0)
+		    && (kernel_mask_set_mod_init)) {
+			/*
+			 * the kernel debug mask was set when the
+			 * kernel module was loaded; don't override
+			 * it if the client-core was started without
+			 * a value for ORANGEFS_KMODMASK.
+			 */
+			return 0;
+		}
+		debug_mask_to_string(&mask_info.mask_value,
+				     mask_info.mask_type);
+		orangefs_gossip_debug_mask = mask_info.mask_value;
+		pr_info("%s: kernel debug mask has been modified to "
+			":%s: :%llx:\n",
+			__func__,
+			kernel_debug_string,
+			(unsigned long long)orangefs_gossip_debug_mask);
+	} else if (mask_info.mask_type == CLIENT_MASK) {
+		debug_mask_to_string(&mask_info.mask_value,
+				     mask_info.mask_type);
+		pr_info("%s: client debug mask has been modified to "
+			":%s: :%llx:\n",
+			__func__,
+			client_debug_string,
+			llu(mask_info.mask_value));
+	} else {
+		gossip_lerr("Invalid mask type....\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
index e4828c0e3ef9..803517269ba6 100644
--- a/fs/orangefs/orangefs-debugfs.h
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -1,3 +1,7 @@
-int orangefs_debugfs_init(void);
-int orangefs_kernel_debug_init(void);
+int orangefs_debugfs_init(int);
 void orangefs_debugfs_cleanup(void);
+int orangefs_client_debug_init(void);
+int orangefs_prepare_debugfs_help_string(int);
+int orangefs_debugfs_new_client_mask(void __user *);
+int orangefs_debugfs_new_client_string(void __user *);
+int orangefs_debugfs_new_debug(void __user *);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index 9eac9d9a3f3a..a3d84ffee905 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -28,7 +28,7 @@
 #define ORANGEFS_VFS_OP_RENAME         0xFF00000A
 #define ORANGEFS_VFS_OP_STATFS         0xFF00000B
 #define ORANGEFS_VFS_OP_TRUNCATE       0xFF00000C
-#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH  0xFF00000D
+#define ORANGEFS_VFS_OP_RA_FLUSH       0xFF00000D
 #define ORANGEFS_VFS_OP_FS_MOUNT       0xFF00000E
 #define ORANGEFS_VFS_OP_FS_UMOUNT      0xFF00000F
 #define ORANGEFS_VFS_OP_GETXATTR       0xFF000010
@@ -41,6 +41,10 @@
 #define ORANGEFS_VFS_OP_FSYNC          0xFF00EE01
 #define ORANGEFS_VFS_OP_FSKEY             0xFF00EE02
 #define ORANGEFS_VFS_OP_READDIRPLUS       0xFF00EE03
+#define ORANGEFS_VFS_OP_FEATURES	0xFF00EE05 /* 2.9.6 */
+
+/* features is a 64-bit unsigned bitmask */
+#define ORANGEFS_FEATURE_READAHEAD 1
 
 /*
  * Misc constants. Please retain them as multiples of 8!
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 633c07a6e3d8..0a82048f3aaf 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -100,16 +100,6 @@ enum orangefs_vfs_op_states {
 };
 
 /*
- * An array of client_debug_mask will be built to hold debug keyword/mask
- * values fetched from userspace.
- */
-struct client_debug_mask {
-	char *keyword;
-	__u64 mask1;
-	__u64 mask2;
-};
-
-/*
  * orangefs kernel memory related flags
  */
 
@@ -119,29 +109,6 @@ struct client_debug_mask {
 #define ORANGEFS_CACHE_CREATE_FLAGS 0
 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
 
-/* these functions are defined in orangefs-utils.c */
-int orangefs_prepare_cdm_array(char *debug_array_string);
-int orangefs_prepare_debugfs_help_string(int);
-
-/* defined in orangefs-debugfs.c */
-int orangefs_client_debug_init(void);
-
-void debug_string_to_mask(char *, void *, int);
-void do_c_mask(int, char *, struct client_debug_mask **);
-void do_k_mask(int, char *, __u64 **);
-
-void debug_mask_to_string(void *, int);
-void do_k_string(void *, int);
-void do_c_string(void *, int);
-int check_amalgam_keyword(void *, int);
-int keyword_is_amalgam(char *);
-
-/*these variables are defined in orangefs-mod.c */
-extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern unsigned int kernel_mask_set_mod_init;
-
 extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
 extern const struct xattr_handler *orangefs_xattr_handlers[];
 
@@ -331,7 +298,7 @@ struct orangefs_stats {
 	unsigned long writes;
 };
 
-extern struct orangefs_stats g_orangefs_stats;
+extern struct orangefs_stats orangefs_stats;
 
 /*
  * NOTE: See Documentation/filesystems/porting for information
@@ -447,6 +414,8 @@ void purge_waiting_ops(void);
 /*
  * defined in super.c
  */
+extern uint64_t orangefs_features;
+
 struct dentry *orangefs_mount(struct file_system_type *fst,
 			   int flags,
 			   const char *devname,
@@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode,
 /*
  * defined in devorangefs-req.c
  */
+extern uint32_t orangefs_userspace_version;
+
 int orangefs_dev_init(void);
 void orangefs_dev_cleanup(void);
 int is_daemon_in_service(void);
@@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
 
 int orangefs_normalize_to_errno(__s32 error_code);
 
-extern struct mutex devreq_mutex;
-extern struct mutex request_mutex;
-extern int debug;
+extern struct mutex orangefs_request_mutex;
 extern int op_timeout_secs;
 extern int slot_timeout_secs;
-extern int dcache_timeout_msecs;
-extern int getattr_timeout_msecs;
+extern int orangefs_dcache_timeout_msecs;
+extern int orangefs_getattr_timeout_msecs;
 extern struct list_head orangefs_superblocks;
 extern spinlock_t orangefs_superblocks_lock;
 extern struct list_head orangefs_request_list;
 extern spinlock_t orangefs_request_list_lock;
 extern wait_queue_head_t orangefs_request_list_waitq;
-extern struct list_head *htable_ops_in_progress;
-extern spinlock_t htable_ops_in_progress_lock;
+extern struct list_head *orangefs_htable_ops_in_progress;
+extern spinlock_t orangefs_htable_ops_in_progress_lock;
 extern int hash_table_size;
 
 extern const struct address_space_operations orangefs_address_operations;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index e9fd5755c05f..2e5b03065f34 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -21,34 +21,17 @@
  * global variables declared here
  */
 
-/* array of client debug keyword/mask values */
-struct client_debug_mask *cdm_array;
-int cdm_element_count;
-
-char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
-char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-
-char *debug_help_string;
-int help_string_initialized;
-struct dentry *help_file_dentry;
-struct dentry *client_debug_dentry;
-struct dentry *debug_dir;
-int client_verbose_index;
-int client_all_index;
-struct orangefs_stats g_orangefs_stats;
+struct orangefs_stats orangefs_stats;
 
 /* the size of the hash tables for ops in progress */
 int hash_table_size = 509;
 
 static ulong module_parm_debug_mask;
-__u64 gossip_debug_mask;
-struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
-unsigned int kernel_mask_set_mod_init; /* implicitly false */
+__u64 orangefs_gossip_debug_mask;
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
-int dcache_timeout_msecs = 50;
-int getattr_timeout_msecs = 50;
+int orangefs_dcache_timeout_msecs = 50;
+int orangefs_getattr_timeout_msecs = 50;
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("ORANGEFS Development Team");
@@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644);
 module_param(op_timeout_secs, int, 0);
 module_param(slot_timeout_secs, int, 0);
 
-/* synchronizes the request device file */
-DEFINE_MUTEX(devreq_mutex);
-
 /*
  * Blocks non-priority requests from being queued for servicing.  This
  * could be used for protecting the request list data structure, but
  * for now it's only being used to stall the op addition to the request
  * list
  */
-DEFINE_MUTEX(request_mutex);
+DEFINE_MUTEX(orangefs_request_mutex);
 
 /* hash table for storing operations waiting for matching downcall */
-struct list_head *htable_ops_in_progress;
-DEFINE_SPINLOCK(htable_ops_in_progress_lock);
+struct list_head *orangefs_htable_ops_in_progress;
+DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock);
 
 /* list for queueing upcall operations */
 LIST_HEAD(orangefs_request_list);
@@ -100,32 +80,6 @@ static int __init orangefs_init(void)
 	int ret = -1;
 	__u32 i = 0;
 
-	/* convert input debug mask to a 64-bit unsigned integer */
-	gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
-
-	/*
-	 * set the kernel's gossip debug string; invalid mask values will
-	 * be ignored.
-	 */
-	debug_mask_to_string(&gossip_debug_mask, 0);
-
-	/* remove any invalid values from the mask */
-	debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
-
-	/*
-	 * if the mask has a non-zero value, then indicate that the mask
-	 * was set when the kernel module was loaded.  The orangefs dev ioctl
-	 * command will look at this boolean to determine if the kernel's
-	 * debug mask should be overwritten when the client-core is started.
-	 */
-	if (gossip_debug_mask != 0)
-		kernel_mask_set_mod_init = true;
-
-	pr_info("%s: called with debug mask: :%s: :%llx:\n",
-		__func__,
-		kernel_debug_string,
-		(unsigned long long)gossip_debug_mask);
-
 	ret = bdi_init(&orangefs_backing_dev_info);
 
 	if (ret)
@@ -146,9 +100,9 @@ static int __init orangefs_init(void)
 	if (ret < 0)
 		goto cleanup_op;
 
-	htable_ops_in_progress =
+	orangefs_htable_ops_in_progress =
 	    kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
-	if (!htable_ops_in_progress) {
+	if (!orangefs_htable_ops_in_progress) {
 		gossip_err("Failed to initialize op hashtable");
 		ret = -ENOMEM;
 		goto cleanup_inode;
@@ -156,7 +110,7 @@ static int __init orangefs_init(void)
 
 	/* initialize a doubly linked at each hash table index */
 	for (i = 0; i < hash_table_size; i++)
-		INIT_LIST_HEAD(&htable_ops_in_progress[i]);
+		INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]);
 
 	ret = fsid_key_table_initialize();
 	if (ret < 0)
@@ -179,14 +133,10 @@ static int __init orangefs_init(void)
 	if (ret)
 		goto cleanup_key_table;
 
-	ret = orangefs_debugfs_init();
+	ret = orangefs_debugfs_init(module_parm_debug_mask);
 	if (ret)
 		goto debugfs_init_failed;
 
-	ret = orangefs_kernel_debug_init();
-	if (ret)
-		goto kernel_debug_init_failed;
-
 	ret = orangefs_sysfs_init();
 	if (ret)
 		goto sysfs_init_failed;
@@ -214,8 +164,6 @@ cleanup_device:
 
 sysfs_init_failed:
 
-kernel_debug_init_failed:
-
 debugfs_init_failed:
 	orangefs_debugfs_cleanup();
 
@@ -223,7 +171,7 @@ cleanup_key_table:
 	fsid_key_table_finalize();
 
 cleanup_progress_table:
-	kfree(htable_ops_in_progress);
+	kfree(orangefs_htable_ops_in_progress);
 
 cleanup_inode:
 	orangefs_inode_cache_finalize();
@@ -250,12 +198,12 @@ static void __exit orangefs_exit(void)
 	orangefs_dev_cleanup();
 	BUG_ON(!list_empty(&orangefs_request_list));
 	for (i = 0; i < hash_table_size; i++)
-		BUG_ON(!list_empty(&htable_ops_in_progress[i]));
+		BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i]));
 
 	orangefs_inode_cache_finalize();
 	op_cache_finalize();
 
-	kfree(htable_ops_in_progress);
+	kfree(orangefs_htable_ops_in_progress);
 
 	bdi_destroy(&orangefs_backing_dev_info);
 
@@ -274,10 +222,10 @@ void purge_inprogress_ops(void)
 		struct orangefs_kernel_op_s *op;
 		struct orangefs_kernel_op_s *next;
 
-		spin_lock(&htable_ops_in_progress_lock);
+		spin_lock(&orangefs_htable_ops_in_progress_lock);
 		list_for_each_entry_safe(op,
 					 next,
-					 &htable_ops_in_progress[i],
+					 &orangefs_htable_ops_in_progress[i],
 					 list) {
 			set_op_state_purged(op);
 			gossip_debug(GOSSIP_DEV_DEBUG,
@@ -287,7 +235,7 @@ void purge_inprogress_ops(void)
 				     op->op_state,
 				     current->comm);
 		}
-		spin_unlock(&htable_ops_in_progress_lock);
+		spin_unlock(&orangefs_htable_ops_in_progress_lock);
 	}
 }
 
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 375708c2db87..a799546a67f7 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -73,6 +73,24 @@
  * Description:
  *			Time getattr is valid in milliseconds.
  *
+ * What:		/sys/fs/orangefs/readahead_count
+ * Date:		Aug 2016
+ * Contact:		Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *			Readahead cache buffer count.
+ *
+ * What:		/sys/fs/orangefs/readahead_size
+ * Date:		Aug 2016
+ * Contact:		Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *			Readahead cache buffer size.
+ *
+ * What:		/sys/fs/orangefs/readahead_count_size
+ * Date:		Aug 2016
+ * Contact:		Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *			Readahead cache buffer count and size.
+ *
  * What:		/sys/fs/orangefs/acache/...
  * Date:		Jun 2015
  * Contact:		Martin Brandenburg <martin@omnibond.com>
@@ -121,159 +139,34 @@
 #define PC_KOBJ_ID "pc"
 #define STATS_KOBJ_ID "stats"
 
-struct orangefs_obj {
-	struct kobject kobj;
-	int op_timeout_secs;
-	int perf_counter_reset;
-	int perf_history_size;
-	int perf_time_interval_secs;
-	int slot_timeout_secs;
-	int dcache_timeout_msecs;
-	int getattr_timeout_msecs;
-};
-
-struct acache_orangefs_obj {
-	struct kobject kobj;
-	int hard_limit;
-	int reclaim_percentage;
-	int soft_limit;
-	int timeout_msecs;
-};
-
-struct capcache_orangefs_obj {
-	struct kobject kobj;
-	int hard_limit;
-	int reclaim_percentage;
-	int soft_limit;
-	int timeout_secs;
-};
-
-struct ccache_orangefs_obj {
-	struct kobject kobj;
-	int hard_limit;
-	int reclaim_percentage;
-	int soft_limit;
-	int timeout_secs;
-};
-
-struct ncache_orangefs_obj {
-	struct kobject kobj;
-	int hard_limit;
-	int reclaim_percentage;
-	int soft_limit;
-	int timeout_msecs;
-};
-
-struct pc_orangefs_obj {
-	struct kobject kobj;
-	char *acache;
-	char *capcache;
-	char *ncache;
-};
-
-struct stats_orangefs_obj {
-	struct kobject kobj;
-	int reads;
-	int writes;
-};
+/*
+ * Every item calls orangefs_attr_show and orangefs_attr_store through
+ * orangefs_sysfs_ops. They look at the orangefs_attributes further below to
+ * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or
+ * sysfs_service_op_store.
+ */
 
 struct orangefs_attribute {
 	struct attribute attr;
-	ssize_t (*show)(struct orangefs_obj *orangefs_obj,
+	ssize_t (*show)(struct kobject *kobj,
 			struct orangefs_attribute *attr,
 			char *buf);
-	ssize_t (*store)(struct orangefs_obj *orangefs_obj,
+	ssize_t (*store)(struct kobject *kobj,
 			 struct orangefs_attribute *attr,
 			 const char *buf,
 			 size_t count);
 };
 
-struct acache_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
-			struct acache_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
-			 struct acache_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
-struct capcache_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
-			struct capcache_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
-			 struct capcache_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
-struct ccache_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
-			struct ccache_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
-			 struct ccache_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
-struct ncache_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
-			struct ncache_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
-			 struct ncache_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
-struct pc_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
-			struct pc_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
-			 struct pc_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
-struct stats_orangefs_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
-			struct stats_orangefs_attribute *attr,
-			char *buf);
-	ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
-			 struct stats_orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count);
-};
-
 static ssize_t orangefs_attr_show(struct kobject *kobj,
 				  struct attribute *attr,
 				  char *buf)
 {
 	struct orangefs_attribute *attribute;
-	struct orangefs_obj *orangefs_obj;
-	int rc;
 
 	attribute = container_of(attr, struct orangefs_attribute, attr);
-	orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(orangefs_obj, attribute, buf);
-
-out:
-	return rc;
+	if (!attribute->show)
+		return -EIO;
+	return attribute->show(kobj, attribute, buf);
 }
 
 static ssize_t orangefs_attr_store(struct kobject *kobj,
@@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj,
 				   size_t len)
 {
 	struct orangefs_attribute *attribute;
-	struct orangefs_obj *orangefs_obj;
-	int rc;
 
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "orangefs_attr_store: start\n");
+	if (!strcmp(kobj->name, PC_KOBJ_ID) ||
+	    !strcmp(kobj->name, STATS_KOBJ_ID))
+		return -EPERM;
 
 	attribute = container_of(attr, struct orangefs_attribute, attr);
-	orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
-	if (!attribute->store) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->store(orangefs_obj, attribute, buf, len);
-
-out:
-	return rc;
+	if (!attribute->store)
+		return -EIO;
+	return attribute->store(kobj, attribute, buf, len);
 }
 
 static const struct sysfs_ops orangefs_sysfs_ops = {
@@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = {
 	.store = orangefs_attr_store,
 };
 
-static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
-					 struct attribute *attr,
-					 char *buf)
-{
-	struct acache_orangefs_attribute *attribute;
-	struct acache_orangefs_obj *acache_orangefs_obj;
-	int rc;
-
-	attribute = container_of(attr, struct acache_orangefs_attribute, attr);
-	acache_orangefs_obj =
-		container_of(kobj, struct acache_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(acache_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
-					  struct attribute *attr,
-					  const char *buf,
-					  size_t len)
-{
-	struct acache_orangefs_attribute *attribute;
-	struct acache_orangefs_obj *acache_orangefs_obj;
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "acache_orangefs_attr_store: start\n");
-
-	attribute = container_of(attr, struct acache_orangefs_attribute, attr);
-	acache_orangefs_obj =
-		container_of(kobj, struct acache_orangefs_obj, kobj);
-
-	if (!attribute->store) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops acache_orangefs_sysfs_ops = {
-	.show = acache_orangefs_attr_show,
-	.store = acache_orangefs_attr_store,
-};
-
-static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
-					   struct attribute *attr,
-					   char *buf)
-{
-	struct capcache_orangefs_attribute *attribute;
-	struct capcache_orangefs_obj *capcache_orangefs_obj;
-	int rc;
-
-	attribute =
-		container_of(attr, struct capcache_orangefs_attribute, attr);
-	capcache_orangefs_obj =
-		container_of(kobj, struct capcache_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(capcache_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
-					    struct attribute *attr,
-					    const char *buf,
-					    size_t len)
-{
-	struct capcache_orangefs_attribute *attribute;
-	struct capcache_orangefs_obj *capcache_orangefs_obj;
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "capcache_orangefs_attr_store: start\n");
-
-	attribute =
-		container_of(attr, struct capcache_orangefs_attribute, attr);
-	capcache_orangefs_obj =
-		container_of(kobj, struct capcache_orangefs_obj, kobj);
-
-	if (!attribute->store) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
-	.show = capcache_orangefs_attr_show,
-	.store = capcache_orangefs_attr_store,
-};
-
-static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
-					 struct attribute *attr,
-					 char *buf)
-{
-	struct ccache_orangefs_attribute *attribute;
-	struct ccache_orangefs_obj *ccache_orangefs_obj;
-	int rc;
-
-	attribute =
-		container_of(attr, struct ccache_orangefs_attribute, attr);
-	ccache_orangefs_obj =
-		container_of(kobj, struct ccache_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(ccache_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
-					  struct attribute *attr,
-					  const char *buf,
-					  size_t len)
-{
-	struct ccache_orangefs_attribute *attribute;
-	struct ccache_orangefs_obj *ccache_orangefs_obj;
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "ccache_orangefs_attr_store: start\n");
-
-	attribute =
-		container_of(attr, struct ccache_orangefs_attribute, attr);
-	ccache_orangefs_obj =
-		container_of(kobj, struct ccache_orangefs_obj, kobj);
-
-	if (!attribute->store) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
-	.show = ccache_orangefs_attr_show,
-	.store = ccache_orangefs_attr_store,
-};
-
-static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
-					 struct attribute *attr,
-					 char *buf)
-{
-	struct ncache_orangefs_attribute *attribute;
-	struct ncache_orangefs_obj *ncache_orangefs_obj;
-	int rc;
-
-	attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
-	ncache_orangefs_obj =
-		container_of(kobj, struct ncache_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(ncache_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
-					  struct attribute *attr,
-					  const char *buf,
-					  size_t len)
-{
-	struct ncache_orangefs_attribute *attribute;
-	struct ncache_orangefs_obj *ncache_orangefs_obj;
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "ncache_orangefs_attr_store: start\n");
-
-	attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
-	ncache_orangefs_obj =
-		container_of(kobj, struct ncache_orangefs_obj, kobj);
-
-	if (!attribute->store) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
-	.show = ncache_orangefs_attr_show,
-	.store = ncache_orangefs_attr_store,
-};
-
-static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
-				     struct attribute *attr,
-				     char *buf)
-{
-	struct pc_orangefs_attribute *attribute;
-	struct pc_orangefs_obj *pc_orangefs_obj;
-	int rc;
-
-	attribute = container_of(attr, struct pc_orangefs_attribute, attr);
-	pc_orangefs_obj =
-		container_of(kobj, struct pc_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(pc_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops pc_orangefs_sysfs_ops = {
-	.show = pc_orangefs_attr_show,
-};
-
-static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
-					struct attribute *attr,
-					char *buf)
-{
-	struct stats_orangefs_attribute *attribute;
-	struct stats_orangefs_obj *stats_orangefs_obj;
-	int rc;
-
-	attribute = container_of(attr, struct stats_orangefs_attribute, attr);
-	stats_orangefs_obj =
-		container_of(kobj, struct stats_orangefs_obj, kobj);
-
-	if (!attribute->show) {
-		rc = -EIO;
-		goto out;
-	}
-
-	rc = attribute->show(stats_orangefs_obj, attribute, buf);
-
-out:
-	return rc;
-}
-
-static const struct sysfs_ops stats_orangefs_sysfs_ops = {
-	.show = stats_orangefs_attr_show,
-};
-
-static void orangefs_release(struct kobject *kobj)
-{
-	struct orangefs_obj *orangefs_obj;
-
-	orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-	kfree(orangefs_obj);
-}
-
-static void acache_orangefs_release(struct kobject *kobj)
-{
-	struct acache_orangefs_obj *acache_orangefs_obj;
-
-	acache_orangefs_obj =
-		container_of(kobj, struct acache_orangefs_obj, kobj);
-	kfree(acache_orangefs_obj);
-}
-
-static void capcache_orangefs_release(struct kobject *kobj)
-{
-	struct capcache_orangefs_obj *capcache_orangefs_obj;
-
-	capcache_orangefs_obj =
-		container_of(kobj, struct capcache_orangefs_obj, kobj);
-	kfree(capcache_orangefs_obj);
-}
-
-static void ccache_orangefs_release(struct kobject *kobj)
-{
-	struct ccache_orangefs_obj *ccache_orangefs_obj;
-
-	ccache_orangefs_obj =
-		container_of(kobj, struct ccache_orangefs_obj, kobj);
-	kfree(ccache_orangefs_obj);
-}
-
-static void ncache_orangefs_release(struct kobject *kobj)
-{
-	struct ncache_orangefs_obj *ncache_orangefs_obj;
-
-	ncache_orangefs_obj =
-		container_of(kobj, struct ncache_orangefs_obj, kobj);
-	kfree(ncache_orangefs_obj);
-}
-
-static void pc_orangefs_release(struct kobject *kobj)
-{
-	struct pc_orangefs_obj *pc_orangefs_obj;
-
-	pc_orangefs_obj =
-		container_of(kobj, struct pc_orangefs_obj, kobj);
-	kfree(pc_orangefs_obj);
-}
-
-static void stats_orangefs_release(struct kobject *kobj)
-{
-	struct stats_orangefs_obj *stats_orangefs_obj;
-
-	stats_orangefs_obj =
-		container_of(kobj, struct stats_orangefs_obj, kobj);
-	kfree(stats_orangefs_obj);
-}
-
-static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_int_show(struct kobject *kobj,
+    struct orangefs_attribute *attr, char *buf)
 {
 	int rc = -EIO;
-	struct orangefs_attribute *orangefs_attr;
-	struct stats_orangefs_attribute *stats_orangefs_attr;
 
-	gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
+	gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n",
+	    kobj->name);
 
-	if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-		orangefs_attr = (struct orangefs_attribute *)attr;
-
-		if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
+	if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "op_timeout_secs")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%d\n",
 				       op_timeout_secs);
 			goto out;
-		} else if (!strcmp(orangefs_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "slot_timeout_secs")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%d\n",
 				       slot_timeout_secs);
 			goto out;
-		} else if (!strcmp(orangefs_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "dcache_timeout_msecs")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%d\n",
-				       dcache_timeout_msecs);
+				       orangefs_dcache_timeout_msecs);
 			goto out;
-		} else if (!strcmp(orangefs_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "getattr_timeout_msecs")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%d\n",
-				       getattr_timeout_msecs);
+				       orangefs_getattr_timeout_msecs);
 			goto out;
 		} else {
 			goto out;
 		}
 
-	} else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
-		stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
-
-		if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
+	} else if (!strcmp(kobj->name, STATS_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "reads")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%lu\n",
-				       g_orangefs_stats.reads);
+				       orangefs_stats.reads);
 			goto out;
-		} else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
+		} else if (!strcmp(attr->attr.name, "writes")) {
 			rc = scnprintf(buf,
 				       PAGE_SIZE,
 				       "%lu\n",
-				       g_orangefs_stats.writes);
+				       orangefs_stats.writes);
 			goto out;
 		} else {
 			goto out;
@@ -714,45 +254,13 @@ out:
 	return rc;
 }
 
-static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
-				 struct orangefs_attribute *attr,
-				 char *buf)
-{
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "int_orangefs_show:start attr->attr.name:%s:\n",
-		     attr->attr.name);
-
-	rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
-	return rc;
-}
-
-static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
-			struct stats_orangefs_attribute *attr,
-			char *buf)
-{
-	int rc;
-
-	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "int_stats_show:start attr->attr.name:%s:\n",
-		     attr->attr.name);
-
-	rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
-
-	return rc;
-}
-
-static ssize_t int_store(struct orangefs_obj *orangefs_obj,
-			 struct orangefs_attribute *attr,
-			 const char *buf,
-			 size_t count)
+static ssize_t sysfs_int_store(struct kobject *kobj,
+    struct orangefs_attribute *attr, const char *buf, size_t count)
 {
 	int rc = 0;
 
 	gossip_debug(GOSSIP_SYSFS_DEBUG,
-		     "int_store: start attr->attr.name:%s: buf:%s:\n",
+		     "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n",
 		     attr->attr.name, buf);
 
 	if (!strcmp(attr->attr.name, "op_timeout_secs")) {
@@ -762,10 +270,10 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
 		rc = kstrtoint(buf, 0, &slot_timeout_secs);
 		goto out;
 	} else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
-		rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+		rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
 		goto out;
 	} else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
-		rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+		rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs);
 		goto out;
 	} else {
 		goto out;
@@ -783,24 +291,19 @@ out:
 /*
  * obtain attribute values from userspace with a service operation.
  */
-static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_service_op_show(struct kobject *kobj,
+    struct orangefs_attribute *attr, char *buf)
 {
 	struct orangefs_kernel_op_s *new_op = NULL;
 	int rc = 0;
 	char *ser_op_type = NULL;
-	struct orangefs_attribute *orangefs_attr;
-	struct acache_orangefs_attribute *acache_attr;
-	struct capcache_orangefs_attribute *capcache_attr;
-	struct ccache_orangefs_attribute *ccache_attr;
-	struct ncache_orangefs_attribute *ncache_attr;
-	struct pc_orangefs_attribute *pc_attr;
 	__u32 op_alloc_type;
 
 	gossip_debug(GOSSIP_SYSFS_DEBUG,
 		     "sysfs_service_op_show: id:%s:\n",
-		     kobj_id);
+		     kobj->name);
 
-	if (strcmp(kobj_id, PC_KOBJ_ID))
+	if (strcmp(kobj->name, PC_KOBJ_ID))
 		op_alloc_type = ORANGEFS_VFS_OP_PARAM;
 	else
 		op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
@@ -818,124 +321,135 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
 		goto out;
 	}
 
-	if (strcmp(kobj_id, PC_KOBJ_ID))
+	if (strcmp(kobj->name, PC_KOBJ_ID))
 		new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
 
-	if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-		orangefs_attr = (struct orangefs_attribute *)attr;
+	if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+		/* Drop unsupported requests first. */
+		if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+		    (!strcmp(attr->attr.name, "readahead_count") ||
+		    !strcmp(attr->attr.name, "readahead_size") ||
+		    !strcmp(attr->attr.name, "readahead_count_size"))) {
+			rc = -EINVAL;
+			goto out;
+		}
 
-		if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
+		if (!strcmp(attr->attr.name, "perf_history_size"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
-		else if (!strcmp(orangefs_attr->attr.name,
+		else if (!strcmp(attr->attr.name,
 				 "perf_time_interval_secs"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
-		else if (!strcmp(orangefs_attr->attr.name,
+		else if (!strcmp(attr->attr.name,
 				 "perf_counter_reset"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
 
-	} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
-		acache_attr = (struct acache_orangefs_attribute *)attr;
+		else if (!strcmp(attr->attr.name,
+				 "readahead_count"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+
+		else if (!strcmp(attr->attr.name,
+				 "readahead_size"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
 
-		if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
+		else if (!strcmp(attr->attr.name,
+				 "readahead_count_size"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+	} else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "timeout_msecs"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
 
-		if (!strcmp(acache_attr->attr.name, "hard_limit"))
+		if (!strcmp(attr->attr.name, "hard_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
 
-		if (!strcmp(acache_attr->attr.name, "soft_limit"))
+		if (!strcmp(attr->attr.name, "soft_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
 
-		if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
+		if (!strcmp(attr->attr.name, "reclaim_percentage"))
 			new_op->upcall.req.param.op =
 			  ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
 
-	} else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
-		capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
-		if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
+	} else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "timeout_secs"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
 
-		if (!strcmp(capcache_attr->attr.name, "hard_limit"))
+		if (!strcmp(attr->attr.name, "hard_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
 
-		if (!strcmp(capcache_attr->attr.name, "soft_limit"))
+		if (!strcmp(attr->attr.name, "soft_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
 
-		if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
+		if (!strcmp(attr->attr.name, "reclaim_percentage"))
 			new_op->upcall.req.param.op =
 			  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
 
-	} else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
-		ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
-		if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
+	} else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "timeout_secs"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
 
-		if (!strcmp(ccache_attr->attr.name, "hard_limit"))
+		if (!strcmp(attr->attr.name, "hard_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
 
-		if (!strcmp(ccache_attr->attr.name, "soft_limit"))
+		if (!strcmp(attr->attr.name, "soft_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
 
-		if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
+		if (!strcmp(attr->attr.name, "reclaim_percentage"))
 			new_op->upcall.req.param.op =
 			  ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
 
-	} else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
-		ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
-		if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
+	} else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "timeout_msecs"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
 
-		if (!strcmp(ncache_attr->attr.name, "hard_limit"))
+		if (!strcmp(attr->attr.name, "hard_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
 
-		if (!strcmp(ncache_attr->attr.name, "soft_limit"))
+		if (!strcmp(attr->attr.name, "soft_limit"))
 			new_op->upcall.req.param.op =
 				ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
 
-		if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
+		if (!strcmp(attr->attr.name, "reclaim_percentage"))
 			new_op->upcall.req.param.op =
 			  ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
 
-	} else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
-		pc_attr = (struct pc_orangefs_attribute *)attr;
-
-		if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
+	} else if (!strcmp(kobj->name, PC_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID))
 			new_op->upcall.req.perf_count.type =
 				ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
 
-		if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
+		if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID))
 			new_op->upcall.req.perf_count.type =
 				ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
 
-		if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
+		if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID))
 			new_op->upcall.req.perf_count.type =
 				ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
 
 	} else {
 		gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
-			   kobj_id);
+			   kobj->name);
 		rc = -EINVAL;
 		goto out;
 	}
 
 
-	if (strcmp(kobj_id, PC_KOBJ_ID))
+	if (strcmp(kobj->name, PC_KOBJ_ID))
 		ser_op_type = "orangefs_param";
 	else
 		ser_op_type = "orangefs_perf_count";
@@ -948,11 +462,18 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
 
 out:
 	if (!rc) {
-		if (strcmp(kobj_id, PC_KOBJ_ID)) {
-			rc = scnprintf(buf,
-				       PAGE_SIZE,
-				       "%d\n",
-				       (int)new_op->downcall.resp.param.value);
+		if (strcmp(kobj->name, PC_KOBJ_ID)) {
+			if (new_op->upcall.req.param.op ==
+			    ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
+				rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+				    (int)new_op->downcall.resp.param.u.
+				    value32[0],
+				    (int)new_op->downcall.resp.param.u.
+				    value32[1]);
+			} else {
+				rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+				    (int)new_op->downcall.resp.param.u.value64);
+			}
 		} else {
 			rc = scnprintf(
 				buf,
@@ -968,77 +489,6 @@ out:
 
 }
 
-static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
-				     struct orangefs_attribute *attr,
-				     char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
-static ssize_t
-	service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
-			    struct acache_orangefs_attribute *attr,
-			    char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
-static ssize_t service_capcache_show(struct capcache_orangefs_obj
-					*capcache_orangefs_obj,
-				     struct capcache_orangefs_attribute *attr,
-				     char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
-static ssize_t service_ccache_show(struct ccache_orangefs_obj
-					*ccache_orangefs_obj,
-				   struct ccache_orangefs_attribute *attr,
-				   char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
-static ssize_t
-	service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
-			    struct ncache_orangefs_attribute *attr,
-			    char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
-static ssize_t
-	service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
-			    struct pc_orangefs_attribute *attr,
-			    char *buf)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
-
-	return rc;
-}
-
 /*
  * pass attribute values back to userspace with a service operation.
  *
@@ -1050,20 +500,16 @@ static ssize_t
  * We want to return 1 if we think everything went OK, and
  * EINVAL if not.
  */
-static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
+static ssize_t sysfs_service_op_store(struct kobject *kobj,
+    struct orangefs_attribute *attr, const char *buf, size_t count)
 {
 	struct orangefs_kernel_op_s *new_op = NULL;
 	int val = 0;
 	int rc = 0;
-	struct orangefs_attribute *orangefs_attr;
-	struct acache_orangefs_attribute *acache_attr;
-	struct capcache_orangefs_attribute *capcache_attr;
-	struct ccache_orangefs_attribute *ccache_attr;
-	struct ncache_orangefs_attribute *ncache_attr;
 
 	gossip_debug(GOSSIP_SYSFS_DEBUG,
 		     "sysfs_service_op_store: id:%s:\n",
-		     kobj_id);
+		     kobj->name);
 
 	new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
 	if (!new_op)
@@ -1079,16 +525,29 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 	}
 
 	/*
-	 * The value we want to send back to userspace is in buf.
+	 * The value we want to send back to userspace is in buf, unless
+	 * there are two parameters, which is specially handled below.
 	 */
-	rc = kstrtoint(buf, 0, &val);
-	if (rc)
-		goto out;
+	if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) ||
+	    strcmp(attr->attr.name, "readahead_count_size")) {
+		rc = kstrtoint(buf, 0, &val);
+		if (rc)
+			goto out;
+	}
 
-	if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-		orangefs_attr = (struct orangefs_attribute *)attr;
+	new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+
+	if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+		/* Drop unsupported requests first. */
+		if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+		    (!strcmp(attr->attr.name, "readahead_count") ||
+		    !strcmp(attr->attr.name, "readahead_size") ||
+		    !strcmp(attr->attr.name, "readahead_count_size"))) {
+			rc = -EINVAL;
+			goto out;
+		}
 
-		if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
+		if (!strcmp(attr->attr.name, "perf_history_size")) {
 			if (val > 0) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
@@ -1096,7 +555,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(orangefs_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "perf_time_interval_secs")) {
 			if (val > 0) {
 				new_op->upcall.req.param.op =
@@ -1105,7 +564,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(orangefs_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "perf_counter_reset")) {
 			if ((val == 0) || (val == 1)) {
 				new_op->upcall.req.param.op =
@@ -1114,12 +573,55 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
+		} else if (!strcmp(attr->attr.name,
+				   "readahead_count")) {
+			if ((val >= 0)) {
+				new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+			} else {
+				rc = 0;
+				goto out;
+			}
+		} else if (!strcmp(attr->attr.name,
+				   "readahead_size")) {
+			if ((val >= 0)) {
+				new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
+			} else {
+				rc = 0;
+				goto out;
+			}
+		} else if (!strcmp(attr->attr.name,
+				   "readahead_count_size")) {
+			int val1, val2;
+			rc = sscanf(buf, "%d %d", &val1, &val2);
+			if (rc < 2) {
+				rc = 0;
+				goto out;
+			}
+			if ((val1 >= 0) && (val2 >= 0)) {
+				new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+			} else {
+				rc = 0;
+				goto out;
+			}
+			new_op->upcall.req.param.u.value32[0] = val1;
+			new_op->upcall.req.param.u.value32[1] = val2;
+			goto value_set;
+		} else if (!strcmp(attr->attr.name,
+				   "perf_counter_reset")) {
+			if ((val > 0)) {
+				new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+			} else {
+				rc = 0;
+				goto out;
+			}
 		}
 
-	} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
-		acache_attr = (struct acache_orangefs_attribute *)attr;
-
-		if (!strcmp(acache_attr->attr.name, "hard_limit")) {
+	} else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "hard_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
@@ -1127,7 +629,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
+		} else if (!strcmp(attr->attr.name, "soft_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
@@ -1135,7 +637,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(acache_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "reclaim_percentage")) {
 			if ((val > -1) && (val < 101)) {
 				new_op->upcall.req.param.op =
@@ -1144,7 +646,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
+		} else if (!strcmp(attr->attr.name, "timeout_msecs")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
@@ -1154,10 +656,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 			}
 		}
 
-	} else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
-		capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
-		if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
+	} else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "hard_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
@@ -1165,7 +665,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
+		} else if (!strcmp(attr->attr.name, "soft_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
@@ -1173,7 +673,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(capcache_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "reclaim_percentage")) {
 			if ((val > -1) && (val < 101)) {
 				new_op->upcall.req.param.op =
@@ -1182,7 +682,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
+		} else if (!strcmp(attr->attr.name, "timeout_secs")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
@@ -1192,10 +692,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 			}
 		}
 
-	} else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
-		ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
-		if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
+	} else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "hard_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
@@ -1203,7 +701,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
+		} else if (!strcmp(attr->attr.name, "soft_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
@@ -1211,7 +709,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ccache_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "reclaim_percentage")) {
 			if ((val > -1) && (val < 101)) {
 				new_op->upcall.req.param.op =
@@ -1220,7 +718,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
+		} else if (!strcmp(attr->attr.name, "timeout_secs")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
@@ -1230,10 +728,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 			}
 		}
 
-	} else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
-		ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
-		if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
+	} else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+		if (!strcmp(attr->attr.name, "hard_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
@@ -1241,7 +737,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
+		} else if (!strcmp(attr->attr.name, "soft_limit")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
@@ -1249,7 +745,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ncache_attr->attr.name,
+		} else if (!strcmp(attr->attr.name,
 				   "reclaim_percentage")) {
 			if ((val > -1) && (val < 101)) {
 				new_op->upcall.req.param.op =
@@ -1258,7 +754,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 				rc = 0;
 				goto out;
 			}
-		} else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
+		} else if (!strcmp(attr->attr.name, "timeout_msecs")) {
 			if (val > -1) {
 				new_op->upcall.req.param.op =
 				  ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
@@ -1270,14 +766,13 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 
 	} else {
 		gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
-			   kobj_id);
+			   kobj->name);
 		rc = -EINVAL;
 		goto out;
 	}
 
-	new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
-
-	new_op->upcall.req.param.value = val;
+	new_op->upcall.req.param.u.value64 = val;
+value_set:
 
 	/*
 	 * The service_operation will return a errno return code on
@@ -1290,7 +785,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 			rc);
 		rc = 0;
 	} else {
-		rc = 1;
+		rc = count;
 	}
 
 out:
@@ -1302,127 +797,56 @@ out:
 	return rc;
 }
 
-static ssize_t
-	service_orangefs_store(struct orangefs_obj *orangefs_obj,
-			       struct orangefs_attribute *attr,
-			       const char *buf,
-			       size_t count)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
-	/* rc should have an errno value if the service_op went bad. */
-	if (rc == 1)
-		rc = count;
-
-	return rc;
-}
-
-static ssize_t
-	service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
-			     struct acache_orangefs_attribute *attr,
-			     const char *buf,
-			     size_t count)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
-
-	/* rc should have an errno value if the service_op went bad. */
-	if (rc == 1)
-		rc = count;
-
-	return rc;
-}
-
-static ssize_t
-	service_capcache_store(struct capcache_orangefs_obj
-				*capcache_orangefs_obj,
-			       struct capcache_orangefs_attribute *attr,
-			       const char *buf,
-			       size_t count)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
-
-	/* rc should have an errno value if the service_op went bad. */
-	if (rc == 1)
-		rc = count;
-
-	return rc;
-}
-
-static ssize_t service_ccache_store(struct ccache_orangefs_obj
-					*ccache_orangefs_obj,
-				    struct ccache_orangefs_attribute *attr,
-				    const char *buf,
-				    size_t count)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
-
-	/* rc should have an errno value if the service_op went bad. */
-	if (rc == 1)
-		rc = count;
-
-	return rc;
-}
-
-static ssize_t
-	service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
-			     struct ncache_orangefs_attribute *attr,
-			     const char *buf,
-			     size_t count)
-{
-	int rc = 0;
-
-	rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
-
-	/* rc should have an errno value if the service_op went bad. */
-	if (rc == 1)
-		rc = count;
-
-	return rc;
-}
-
 static struct orangefs_attribute op_timeout_secs_attribute =
-	__ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
+	__ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute slot_timeout_secs_attribute =
-	__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+	__ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute dcache_timeout_msecs_attribute =
-	__ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+	__ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute getattr_timeout_msecs_attribute =
-	__ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+	__ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
+static struct orangefs_attribute readahead_count_attribute =
+	__ATTR(readahead_count, 0664, sysfs_service_op_show,
+	       sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_size_attribute =
+	__ATTR(readahead_size, 0664, sysfs_service_op_show,
+	       sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_count_size_attribute =
+	__ATTR(readahead_count_size, 0664, sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct orangefs_attribute perf_counter_reset_attribute =
 	__ATTR(perf_counter_reset,
 	       0664,
-	       service_orangefs_show,
-	       service_orangefs_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct orangefs_attribute perf_history_size_attribute =
 	__ATTR(perf_history_size,
 	       0664,
-	       service_orangefs_show,
-	       service_orangefs_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct orangefs_attribute perf_time_interval_secs_attribute =
 	__ATTR(perf_time_interval_secs,
 	       0664,
-	       service_orangefs_show,
-	       service_orangefs_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct attribute *orangefs_default_attrs[] = {
 	&op_timeout_secs_attribute.attr,
 	&slot_timeout_secs_attribute.attr,
 	&dcache_timeout_msecs_attribute.attr,
 	&getattr_timeout_msecs_attribute.attr,
+	&readahead_count_attribute.attr,
+	&readahead_size_attribute.attr,
+	&readahead_count_size_attribute.attr,
 	&perf_counter_reset_attribute.attr,
 	&perf_history_size_attribute.attr,
 	&perf_time_interval_secs_attribute.attr,
@@ -1431,33 +855,32 @@ static struct attribute *orangefs_default_attrs[] = {
 
 static struct kobj_type orangefs_ktype = {
 	.sysfs_ops = &orangefs_sysfs_ops,
-	.release = orangefs_release,
 	.default_attrs = orangefs_default_attrs,
 };
 
-static struct acache_orangefs_attribute acache_hard_limit_attribute =
+static struct orangefs_attribute acache_hard_limit_attribute =
 	__ATTR(hard_limit,
 	       0664,
-	       service_acache_show,
-	       service_acache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
+static struct orangefs_attribute acache_reclaim_percent_attribute =
 	__ATTR(reclaim_percentage,
 	       0664,
-	       service_acache_show,
-	       service_acache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_soft_limit_attribute =
+static struct orangefs_attribute acache_soft_limit_attribute =
 	__ATTR(soft_limit,
 	       0664,
-	       service_acache_show,
-	       service_acache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
+static struct orangefs_attribute acache_timeout_msecs_attribute =
 	__ATTR(timeout_msecs,
 	       0664,
-	       service_acache_show,
-	       service_acache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct attribute *acache_orangefs_default_attrs[] = {
 	&acache_hard_limit_attribute.attr,
@@ -1468,34 +891,33 @@ static struct attribute *acache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type acache_orangefs_ktype = {
-	.sysfs_ops = &acache_orangefs_sysfs_ops,
-	.release = acache_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = acache_orangefs_default_attrs,
 };
 
-static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
+static struct orangefs_attribute capcache_hard_limit_attribute =
 	__ATTR(hard_limit,
 	       0664,
-	       service_capcache_show,
-	       service_capcache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
+static struct orangefs_attribute capcache_reclaim_percent_attribute =
 	__ATTR(reclaim_percentage,
 	       0664,
-	       service_capcache_show,
-	       service_capcache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
+static struct orangefs_attribute capcache_soft_limit_attribute =
 	__ATTR(soft_limit,
 	       0664,
-	       service_capcache_show,
-	       service_capcache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
+static struct orangefs_attribute capcache_timeout_secs_attribute =
 	__ATTR(timeout_secs,
 	       0664,
-	       service_capcache_show,
-	       service_capcache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct attribute *capcache_orangefs_default_attrs[] = {
 	&capcache_hard_limit_attribute.attr,
@@ -1506,34 +928,33 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type capcache_orangefs_ktype = {
-	.sysfs_ops = &capcache_orangefs_sysfs_ops,
-	.release = capcache_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = capcache_orangefs_default_attrs,
 };
 
-static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
+static struct orangefs_attribute ccache_hard_limit_attribute =
 	__ATTR(hard_limit,
 	       0664,
-	       service_ccache_show,
-	       service_ccache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
+static struct orangefs_attribute ccache_reclaim_percent_attribute =
 	__ATTR(reclaim_percentage,
 	       0664,
-	       service_ccache_show,
-	       service_ccache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
+static struct orangefs_attribute ccache_soft_limit_attribute =
 	__ATTR(soft_limit,
 	       0664,
-	       service_ccache_show,
-	       service_ccache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
+static struct orangefs_attribute ccache_timeout_secs_attribute =
 	__ATTR(timeout_secs,
 	       0664,
-	       service_ccache_show,
-	       service_ccache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct attribute *ccache_orangefs_default_attrs[] = {
 	&ccache_hard_limit_attribute.attr,
@@ -1544,34 +965,33 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type ccache_orangefs_ktype = {
-	.sysfs_ops = &ccache_orangefs_sysfs_ops,
-	.release = ccache_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = ccache_orangefs_default_attrs,
 };
 
-static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
+static struct orangefs_attribute ncache_hard_limit_attribute =
 	__ATTR(hard_limit,
 	       0664,
-	       service_ncache_show,
-	       service_ncache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
+static struct orangefs_attribute ncache_reclaim_percent_attribute =
 	__ATTR(reclaim_percentage,
 	       0664,
-	       service_ncache_show,
-	       service_ncache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
+static struct orangefs_attribute ncache_soft_limit_attribute =
 	__ATTR(soft_limit,
 	       0664,
-	       service_ncache_show,
-	       service_ncache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
+static struct orangefs_attribute ncache_timeout_msecs_attribute =
 	__ATTR(timeout_msecs,
 	       0664,
-	       service_ncache_show,
-	       service_ncache_store);
+	       sysfs_service_op_show,
+	       sysfs_service_op_store);
 
 static struct attribute *ncache_orangefs_default_attrs[] = {
 	&ncache_hard_limit_attribute.attr,
@@ -1582,27 +1002,26 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type ncache_orangefs_ktype = {
-	.sysfs_ops = &ncache_orangefs_sysfs_ops,
-	.release = ncache_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = ncache_orangefs_default_attrs,
 };
 
-static struct pc_orangefs_attribute pc_acache_attribute =
+static struct orangefs_attribute pc_acache_attribute =
 	__ATTR(acache,
 	       0664,
-	       service_pc_show,
+	       sysfs_service_op_show,
 	       NULL);
 
-static struct pc_orangefs_attribute pc_capcache_attribute =
+static struct orangefs_attribute pc_capcache_attribute =
 	__ATTR(capcache,
 	       0664,
-	       service_pc_show,
+	       sysfs_service_op_show,
 	       NULL);
 
-static struct pc_orangefs_attribute pc_ncache_attribute =
+static struct orangefs_attribute pc_ncache_attribute =
 	__ATTR(ncache,
 	       0664,
-	       service_pc_show,
+	       sysfs_service_op_show,
 	       NULL);
 
 static struct attribute *pc_orangefs_default_attrs[] = {
@@ -1613,21 +1032,20 @@ static struct attribute *pc_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type pc_orangefs_ktype = {
-	.sysfs_ops = &pc_orangefs_sysfs_ops,
-	.release = pc_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = pc_orangefs_default_attrs,
 };
 
-static struct stats_orangefs_attribute stats_reads_attribute =
+static struct orangefs_attribute stats_reads_attribute =
 	__ATTR(reads,
 	       0664,
-	       int_stats_show,
+	       sysfs_int_show,
 	       NULL);
 
-static struct stats_orangefs_attribute stats_writes_attribute =
+static struct orangefs_attribute stats_writes_attribute =
 	__ATTR(writes,
 	       0664,
-	       int_stats_show,
+	       sysfs_int_show,
 	       NULL);
 
 static struct attribute *stats_orangefs_default_attrs[] = {
@@ -1637,18 +1055,17 @@ static struct attribute *stats_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type stats_orangefs_ktype = {
-	.sysfs_ops = &stats_orangefs_sysfs_ops,
-	.release = stats_orangefs_release,
+	.sysfs_ops = &orangefs_sysfs_ops,
 	.default_attrs = stats_orangefs_default_attrs,
 };
 
-static struct orangefs_obj *orangefs_obj;
-static struct acache_orangefs_obj *acache_orangefs_obj;
-static struct capcache_orangefs_obj *capcache_orangefs_obj;
-static struct ccache_orangefs_obj *ccache_orangefs_obj;
-static struct ncache_orangefs_obj *ncache_orangefs_obj;
-static struct pc_orangefs_obj *pc_orangefs_obj;
-static struct stats_orangefs_obj *stats_orangefs_obj;
+static struct kobject *orangefs_obj;
+static struct kobject *acache_orangefs_obj;
+static struct kobject *capcache_orangefs_obj;
+static struct kobject *ccache_orangefs_obj;
+static struct kobject *ncache_orangefs_obj;
+static struct kobject *pc_orangefs_obj;
+static struct kobject *stats_orangefs_obj;
 
 int orangefs_sysfs_init(void)
 {
@@ -1661,7 +1078,7 @@ int orangefs_sysfs_init(void)
 	if (!orangefs_obj)
 		goto out;
 
-	rc = kobject_init_and_add(&orangefs_obj->kobj,
+	rc = kobject_init_and_add(orangefs_obj,
 				  &orangefs_ktype,
 				  fs_kobj,
 				  ORANGEFS_KOBJ_ID);
@@ -1669,7 +1086,7 @@ int orangefs_sysfs_init(void)
 	if (rc)
 		goto ofs_obj_bail;
 
-	kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/acache. */
 	acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
@@ -1678,15 +1095,15 @@ int orangefs_sysfs_init(void)
 		goto ofs_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
+	rc = kobject_init_and_add(acache_orangefs_obj,
 				  &acache_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  ACACHE_KOBJ_ID);
 
 	if (rc)
 		goto acache_obj_bail;
 
-	kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(acache_orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/capcache. */
 	capcache_orangefs_obj =
@@ -1696,14 +1113,14 @@ int orangefs_sysfs_init(void)
 		goto acache_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
+	rc = kobject_init_and_add(capcache_orangefs_obj,
 				  &capcache_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  CAPCACHE_KOBJ_ID);
 	if (rc)
 		goto capcache_obj_bail;
 
-	kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(capcache_orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/ccache. */
 	ccache_orangefs_obj =
@@ -1713,14 +1130,14 @@ int orangefs_sysfs_init(void)
 		goto capcache_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
+	rc = kobject_init_and_add(ccache_orangefs_obj,
 				  &ccache_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  CCACHE_KOBJ_ID);
 	if (rc)
 		goto ccache_obj_bail;
 
-	kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(ccache_orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/ncache. */
 	ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
@@ -1729,15 +1146,15 @@ int orangefs_sysfs_init(void)
 		goto ccache_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
+	rc = kobject_init_and_add(ncache_orangefs_obj,
 				  &ncache_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  NCACHE_KOBJ_ID);
 
 	if (rc)
 		goto ncache_obj_bail;
 
-	kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(ncache_orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/perf_counters. */
 	pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
@@ -1746,15 +1163,15 @@ int orangefs_sysfs_init(void)
 		goto ncache_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
+	rc = kobject_init_and_add(pc_orangefs_obj,
 				  &pc_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  "perf_counters");
 
 	if (rc)
 		goto pc_obj_bail;
 
-	kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(pc_orangefs_obj, KOBJ_ADD);
 
 	/* create /sys/fs/orangefs/stats. */
 	stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
@@ -1763,37 +1180,31 @@ int orangefs_sysfs_init(void)
 		goto pc_obj_bail;
 	}
 
-	rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
+	rc = kobject_init_and_add(stats_orangefs_obj,
 				  &stats_orangefs_ktype,
-				  &orangefs_obj->kobj,
+				  orangefs_obj,
 				  STATS_KOBJ_ID);
 
 	if (rc)
 		goto stats_obj_bail;
 
-	kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
+	kobject_uevent(stats_orangefs_obj, KOBJ_ADD);
 	goto out;
 
 stats_obj_bail:
-		kobject_put(&stats_orangefs_obj->kobj);
-
+		kobject_put(stats_orangefs_obj);
 pc_obj_bail:
-		kobject_put(&pc_orangefs_obj->kobj);
-
+		kobject_put(pc_orangefs_obj);
 ncache_obj_bail:
-		kobject_put(&ncache_orangefs_obj->kobj);
-
+		kobject_put(ncache_orangefs_obj);
 ccache_obj_bail:
-		kobject_put(&ccache_orangefs_obj->kobj);
-
+		kobject_put(ccache_orangefs_obj);
 capcache_obj_bail:
-		kobject_put(&capcache_orangefs_obj->kobj);
-
+		kobject_put(capcache_orangefs_obj);
 acache_obj_bail:
-		kobject_put(&acache_orangefs_obj->kobj);
-
+		kobject_put(acache_orangefs_obj);
 ofs_obj_bail:
-		kobject_put(&orangefs_obj->kobj);
+		kobject_put(orangefs_obj);
 out:
 	return rc;
 }
@@ -1801,13 +1212,11 @@ out:
 void orangefs_sysfs_exit(void)
 {
 	gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
-
-	kobject_put(&acache_orangefs_obj->kobj);
-	kobject_put(&capcache_orangefs_obj->kobj);
-	kobject_put(&ccache_orangefs_obj->kobj);
-	kobject_put(&ncache_orangefs_obj->kobj);
-	kobject_put(&pc_orangefs_obj->kobj);
-	kobject_put(&stats_orangefs_obj->kobj);
-
-	kobject_put(&orangefs_obj->kobj);
+	kobject_put(acache_orangefs_obj);
+	kobject_put(capcache_orangefs_obj);
+	kobject_put(ccache_orangefs_obj);
+	kobject_put(ncache_orangefs_obj);
+	kobject_put(pc_orangefs_obj);
+	kobject_put(stats_orangefs_obj);
+	kobject_put(orangefs_obj);
 }
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index d13c7291fd05..06af81f71e10 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op)
 		case ORANGEFS_VFS_OP_TRUNCATE:
 			fsid = op->upcall.req.truncate.refn.fs_id;
 			break;
-		case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+		case ORANGEFS_VFS_OP_RA_FLUSH:
 			fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
 			break;
 		case ORANGEFS_VFS_OP_FS_UMOUNT:
@@ -347,7 +347,8 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 	inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
 	    orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
 
-	orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
+	orangefs_inode->getattr_time = jiffies +
+	    orangefs_getattr_timeout_msecs*HZ/1000;
 	ret = 0;
 out:
 	op_release(new_op);
@@ -656,401 +657,3 @@ __s32 ORANGEFS_util_translate_mode(int mode)
 	return ret;
 }
 #undef NUM_MODES
-
-/*
- * After obtaining a string representation of the client's debug
- * keywords and their associated masks, this function is called to build an
- * array of these values.
- */
-int orangefs_prepare_cdm_array(char *debug_array_string)
-{
-	int i;
-	int rc = -EINVAL;
-	char *cds_head = NULL;
-	char *cds_delimiter = NULL;
-	int keyword_len = 0;
-
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-	/*
-	 * figure out how many elements the cdm_array needs.
-	 */
-	for (i = 0; i < strlen(debug_array_string); i++)
-		if (debug_array_string[i] == '\n')
-			cdm_element_count++;
-
-	if (!cdm_element_count) {
-		pr_info("No elements in client debug array string!\n");
-		goto out;
-	}
-
-	cdm_array =
-		kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
-			GFP_KERNEL);
-	if (!cdm_array) {
-		pr_info("malloc failed for cdm_array!\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	cds_head = debug_array_string;
-
-	for (i = 0; i < cdm_element_count; i++) {
-		cds_delimiter = strchr(cds_head, '\n');
-		*cds_delimiter = '\0';
-
-		keyword_len = strcspn(cds_head, " ");
-
-		cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
-		if (!cdm_array[i].keyword) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		sscanf(cds_head,
-		       "%s %llx %llx",
-		       cdm_array[i].keyword,
-		       (unsigned long long *)&(cdm_array[i].mask1),
-		       (unsigned long long *)&(cdm_array[i].mask2));
-
-		if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
-			client_verbose_index = i;
-
-		if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
-			client_all_index = i;
-
-		cds_head = cds_delimiter + 1;
-	}
-
-	rc = cdm_element_count;
-
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-
-out:
-
-	return rc;
-
-}
-
-/*
- * /sys/kernel/debug/orangefs/debug-help can be catted to
- * see all the available kernel and client debug keywords.
- *
- * When the kernel boots, we have no idea what keywords the
- * client supports, nor their associated masks.
- *
- * We pass through this function once at boot and stamp a
- * boilerplate "we don't know" message for the client in the
- * debug-help file. We pass through here again when the client
- * starts and then we can fill out the debug-help file fully.
- *
- * The client might be restarted any number of times between
- * reboots, we only build the debug-help file the first time.
- */
-int orangefs_prepare_debugfs_help_string(int at_boot)
-{
-	int rc = -EINVAL;
-	int i;
-	int byte_count = 0;
-	char *client_title = "Client Debug Keywords:\n";
-	char *kernel_title = "Kernel Debug Keywords:\n";
-
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-	if (at_boot) {
-		byte_count += strlen(HELP_STRING_UNINITIALIZED);
-		client_title = HELP_STRING_UNINITIALIZED;
-	} else {
-		/*
-		 * fill the client keyword/mask array and remember
-		 * how many elements there were.
-		 */
-		cdm_element_count =
-			orangefs_prepare_cdm_array(client_debug_array_string);
-		if (cdm_element_count <= 0)
-			goto out;
-
-		/* Count the bytes destined for debug_help_string. */
-		byte_count += strlen(client_title);
-
-		for (i = 0; i < cdm_element_count; i++) {
-			byte_count += strlen(cdm_array[i].keyword + 2);
-			if (byte_count >= DEBUG_HELP_STRING_SIZE) {
-				pr_info("%s: overflow 1!\n", __func__);
-				goto out;
-			}
-		}
-
-		gossip_debug(GOSSIP_UTILS_DEBUG,
-			     "%s: cdm_element_count:%d:\n",
-			     __func__,
-			     cdm_element_count);
-	}
-
-	byte_count += strlen(kernel_title);
-	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
-		byte_count +=
-			strlen(s_kmod_keyword_mask_map[i].keyword + 2);
-		if (byte_count >= DEBUG_HELP_STRING_SIZE) {
-			pr_info("%s: overflow 2!\n", __func__);
-			goto out;
-		}
-	}
-
-	/* build debug_help_string. */
-	debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
-	if (!debug_help_string) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	strcat(debug_help_string, client_title);
-
-	if (!at_boot) {
-		for (i = 0; i < cdm_element_count; i++) {
-			strcat(debug_help_string, "\t");
-			strcat(debug_help_string, cdm_array[i].keyword);
-			strcat(debug_help_string, "\n");
-		}
-	}
-
-	strcat(debug_help_string, "\n");
-	strcat(debug_help_string, kernel_title);
-
-	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
-		strcat(debug_help_string, "\t");
-		strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
-		strcat(debug_help_string, "\n");
-	}
-
-	rc = 0;
-
-out:
-
-	return rc;
-
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_mask_to_string(void *mask, int type)
-{
-	int i;
-	int len = 0;
-	char *debug_string;
-	int element_count = 0;
-
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-	if (type) {
-		debug_string = client_debug_string;
-		element_count = cdm_element_count;
-	} else {
-		debug_string = kernel_debug_string;
-		element_count = num_kmod_keyword_mask_map;
-	}
-
-	memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
-
-	/*
-	 * Some keywords, like "all" or "verbose", are amalgams of
-	 * numerous other keywords. Make a special check for those
-	 * before grinding through the whole mask only to find out
-	 * later...
-	 */
-	if (check_amalgam_keyword(mask, type))
-		goto out;
-
-	/* Build the debug string. */
-	for (i = 0; i < element_count; i++)
-		if (type)
-			do_c_string(mask, i);
-		else
-			do_k_string(mask, i);
-
-	len = strlen(debug_string);
-
-	if ((len) && (type))
-		client_debug_string[len - 1] = '\0';
-	else if (len)
-		kernel_debug_string[len - 1] = '\0';
-	else if (type)
-		strcpy(client_debug_string, "none");
-	else
-		strcpy(kernel_debug_string, "none");
-
-out:
-gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
-
-	return;
-
-}
-
-void do_k_string(void *k_mask, int index)
-{
-	__u64 *mask = (__u64 *) k_mask;
-
-	if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
-		goto out;
-
-	if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
-		if ((strlen(kernel_debug_string) +
-		     strlen(s_kmod_keyword_mask_map[index].keyword))
-			< ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
-				strcat(kernel_debug_string,
-				       s_kmod_keyword_mask_map[index].keyword);
-				strcat(kernel_debug_string, ",");
-			} else {
-				gossip_err("%s: overflow!\n", __func__);
-				strcpy(kernel_debug_string, ORANGEFS_ALL);
-				goto out;
-			}
-	}
-
-out:
-
-	return;
-}
-
-void do_c_string(void *c_mask, int index)
-{
-	struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
-
-	if (keyword_is_amalgam(cdm_array[index].keyword))
-		goto out;
-
-	if ((mask->mask1 & cdm_array[index].mask1) ||
-	    (mask->mask2 & cdm_array[index].mask2)) {
-		if ((strlen(client_debug_string) +
-		     strlen(cdm_array[index].keyword) + 1)
-			< ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
-				strcat(client_debug_string,
-				       cdm_array[index].keyword);
-				strcat(client_debug_string, ",");
-			} else {
-				gossip_err("%s: overflow!\n", __func__);
-				strcpy(client_debug_string, ORANGEFS_ALL);
-				goto out;
-			}
-	}
-out:
-	return;
-}
-
-int keyword_is_amalgam(char *keyword)
-{
-	int rc = 0;
-
-	if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
-		rc = 1;
-
-	return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- *
- * return 1 if we found an amalgam.
- */
-int check_amalgam_keyword(void *mask, int type)
-{
-	__u64 *k_mask;
-	struct client_debug_mask *c_mask;
-	int k_all_index = num_kmod_keyword_mask_map - 1;
-	int rc = 0;
-
-	if (type) {
-		c_mask = (struct client_debug_mask *) mask;
-
-		if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
-		    (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
-			strcpy(client_debug_string, ORANGEFS_ALL);
-			rc = 1;
-			goto out;
-		}
-
-		if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
-		    (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
-			strcpy(client_debug_string, ORANGEFS_VERBOSE);
-			rc = 1;
-			goto out;
-		}
-
-	} else {
-		k_mask = (__u64 *) mask;
-
-		if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
-			strcpy(kernel_debug_string, ORANGEFS_ALL);
-			rc = 1;
-			goto out;
-		}
-	}
-
-out:
-
-	return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_string_to_mask(char *debug_string, void *mask, int type)
-{
-	char *unchecked_keyword;
-	int i;
-	char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
-	char *original_pointer;
-	int element_count = 0;
-	struct client_debug_mask *c_mask;
-	__u64 *k_mask;
-
-	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-	if (type) {
-		c_mask = (struct client_debug_mask *)mask;
-		element_count = cdm_element_count;
-	} else {
-		k_mask = (__u64 *)mask;
-		*k_mask = 0;
-		element_count = num_kmod_keyword_mask_map;
-	}
-
-	original_pointer = strsep_fodder;
-	while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
-		if (strlen(unchecked_keyword)) {
-			for (i = 0; i < element_count; i++)
-				if (type)
-					do_c_mask(i,
-						  unchecked_keyword,
-						  &c_mask);
-				else
-					do_k_mask(i,
-						  unchecked_keyword,
-						  &k_mask);
-		}
-
-	kfree(original_pointer);
-}
-
-void do_c_mask(int i,
-	       char *unchecked_keyword,
-	       struct client_debug_mask **sane_mask)
-{
-
-	if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
-		(**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
-		(**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
-	}
-}
-
-void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
-{
-
-	if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
-		**sane_mask = (**sane_mask) |
-				s_kmod_keyword_mask_map[i].mask_val;
-}
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 3d7418c728f5..971307ad69be 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -4,26 +4,6 @@
 #include <linux/slab.h>
 #include <linux/ioctl.h>
 
-extern struct client_debug_mask *cdm_array;
-extern char *debug_help_string;
-extern int help_string_initialized;
-extern struct dentry *debug_dir;
-extern struct dentry *help_file_dentry;
-extern struct dentry *client_debug_dentry;
-extern const struct file_operations debug_help_fops;
-extern int client_all_index;
-extern int client_verbose_index;
-extern int cdm_element_count;
-#define DEBUG_HELP_STRING_SIZE 4096
-#define HELP_STRING_UNINITIALIZED \
-	"Client Debug Keywords are unknown until the first time\n" \
-	"the client is started after boot.\n"
-#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
-#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
-#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
-#define ORANGEFS_VERBOSE "verbose"
-#define ORANGEFS_ALL "all"
-
 /* pvfs2-config.h ***********************************************************/
 #define ORANGEFS_VERSION_MAJOR 2
 #define ORANGEFS_VERSION_MINOR 9
@@ -426,13 +406,12 @@ do {									\
 		printk(KERN_DEBUG fmt, ##__VA_ARGS__);			\
 } while (0)
 #else
-extern __u64 gossip_debug_mask;
-extern struct client_debug_mask client_debug_mask;
+extern __u64 orangefs_gossip_debug_mask;
 
 /* try to avoid function call overhead by checking masks in macro */
 #define gossip_debug(mask, fmt, ...)					\
 do {									\
-	if (gossip_debug_mask & (mask))					\
+	if (orangefs_gossip_debug_mask & (mask))			\
 		printk(KERN_DEBUG fmt, ##__VA_ARGS__);			\
 } while (0)
 #endif /* GOSSIP_DISABLE_DEBUG */
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index b9da9a0281c9..c48859f16e7b 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -33,6 +33,7 @@ static const match_table_t tokens = {
 	{ Opt_err,	NULL }
 };
 
+uint64_t orangefs_features;
 
 static int parse_mount_options(struct super_block *sb, char *options,
 		int silent)
@@ -249,6 +250,19 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
 	}
 
 	op_release(new_op);
+
+	if (orangefs_userspace_version >= 20906) {
+		new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+		if (!new_op)
+			return -ENOMEM;
+		new_op->upcall.req.features.features = 0;
+		ret = service_operation(new_op, "orangefs_features", 0);
+		orangefs_features = new_op->downcall.resp.features.features;
+		op_release(new_op);
+	} else {
+		orangefs_features = 0;
+	}
+
 	return ret;
 }
 
@@ -492,6 +506,19 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 	list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
 	spin_unlock(&orangefs_superblocks_lock);
 	op_release(new_op);
+
+	if (orangefs_userspace_version >= 20906) {
+		new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+		if (!new_op)
+			return ERR_PTR(-ENOMEM);
+		new_op->upcall.req.features.features = 0;
+		ret = service_operation(new_op, "orangefs_features", 0);
+		orangefs_features = new_op->downcall.resp.features.features;
+		op_release(new_op);
+	} else {
+		orangefs_features = 0;
+	}
+
 	return dget(sb->s_root);
 
 free_op:
@@ -530,8 +557,8 @@ void orangefs_kill_sb(struct super_block *sb)
 	 * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
 	 * gets completed before we free the dang thing.
 	 */
-	mutex_lock(&request_mutex);
-	mutex_unlock(&request_mutex);
+	mutex_lock(&orangefs_request_mutex);
+	mutex_unlock(&orangefs_request_mutex);
 
 	/* free the orangefs superblock private data */
 	kfree(ORANGEFS_SB(sb));
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
index 001b20239407..af0b0e36d559 100644
--- a/fs/orangefs/upcall.h
+++ b/fs/orangefs/upcall.h
@@ -98,7 +98,7 @@ struct orangefs_truncate_request_s {
 	__s64 size;
 };
 
-struct orangefs_mmap_ra_cache_flush_request_s {
+struct orangefs_ra_cache_flush_request_s {
 	struct orangefs_object_kref refn;
 };
 
@@ -179,12 +179,18 @@ enum orangefs_param_request_op {
 	ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
 	ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
 	ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+	ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26,
+	ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27,
+	ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28,
 };
 
 struct orangefs_param_request_s {
 	enum orangefs_param_request_type type;
 	enum orangefs_param_request_op op;
-	__s64 value;
+	union {
+		__s64 value64;
+		__s32 value32[2];
+	} u;
 	char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
 };
 
@@ -204,6 +210,11 @@ struct orangefs_fs_key_request_s {
 	__s32 __pad1;
 };
 
+/* 2.9.6 */
+struct orangefs_features_request_s {
+	__u64 features;
+};
+
 struct orangefs_upcall_s {
 	__s32 type;
 	__u32 uid;
@@ -228,7 +239,7 @@ struct orangefs_upcall_s {
 		struct orangefs_rename_request_s rename;
 		struct orangefs_statfs_request_s statfs;
 		struct orangefs_truncate_request_s truncate;
-		struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+		struct orangefs_ra_cache_flush_request_s ra_cache_flush;
 		struct orangefs_fs_mount_request_s fs_mount;
 		struct orangefs_fs_umount_request_s fs_umount;
 		struct orangefs_getxattr_request_s getxattr;
@@ -240,6 +251,7 @@ struct orangefs_upcall_s {
 		struct orangefs_param_request_s param;
 		struct orangefs_perf_count_request_s perf_count;
 		struct orangefs_fs_key_request_s fs_key;
+		struct orangefs_features_request_s features;
 	} req;
 };
 
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 31635bc303fe..abcfa3fa9992 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -87,9 +87,9 @@ retry_servicing:
 	 */
 	if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
 		if (flags & ORANGEFS_OP_INTERRUPTIBLE)
-			ret = mutex_lock_interruptible(&request_mutex);
+			ret = mutex_lock_interruptible(&orangefs_request_mutex);
 		else
-			ret = mutex_lock_killable(&request_mutex);
+			ret = mutex_lock_killable(&orangefs_request_mutex);
 		/*
 		 * check to see if we were interrupted while waiting for
 		 * mutex
@@ -129,7 +129,7 @@ retry_servicing:
 	spin_unlock(&orangefs_request_list_lock);
 
 	if (!(flags & ORANGEFS_OP_NO_MUTEX))
-		mutex_unlock(&request_mutex);
+		mutex_unlock(&orangefs_request_mutex);
 
 	ret = wait_for_matching_downcall(op, timeout,
 					 flags & ORANGEFS_OP_INTERRUPTIBLE);
@@ -272,9 +272,9 @@ static void
 	} else if (op_state_in_progress(op)) {
 		/* op must be removed from the in progress htable */
 		spin_unlock(&op->lock);
-		spin_lock(&htable_ops_in_progress_lock);
+		spin_lock(&orangefs_htable_ops_in_progress_lock);
 		list_del_init(&op->list);
-		spin_unlock(&htable_ops_in_progress_lock);
+		spin_unlock(&orangefs_htable_ops_in_progress_lock);
 		gossip_debug(GOSSIP_WAIT_DEBUG,
 			     "Interrupted: Removed op %p"
 			     " from htable_ops_in_progress\n",
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 43fdc2765aea..db37a0e02d32 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -105,6 +105,13 @@ retry:
 			goto retry;
 		}
 
+		error = security_inode_copy_up_xattr(name);
+		if (error < 0 && error != -EOPNOTSUPP)
+			break;
+		if (error == 1) {
+			error = 0;
+			continue; /* Discard */
+		}
 		error = vfs_setxattr(new, name, value, size, 0);
 		if (error)
 			break;
@@ -248,6 +255,8 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
 	struct dentry *upper = NULL;
 	umode_t mode = stat->mode;
 	int err;
+	const struct cred *old_creds = NULL;
+	struct cred *new_creds = NULL;
 
 	newdentry = ovl_lookup_temp(workdir, dentry);
 	err = PTR_ERR(newdentry);
@@ -260,10 +269,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
 	if (IS_ERR(upper))
 		goto out1;
 
+	err = security_inode_copy_up(dentry, &new_creds);
+	if (err < 0)
+		goto out2;
+
+	if (new_creds)
+		old_creds = override_creds(new_creds);
+
 	/* Can't properly set mode on creation because of the umask */
 	stat->mode &= S_IFMT;
 	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
 	stat->mode = mode;
+
+	if (new_creds) {
+		revert_creds(old_creds);
+		put_cred(new_creds);
+	}
+
 	if (err)
 		goto out2;
 
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 1560fdc09a5f..b0ffa1d1677e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -489,6 +489,15 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 	if (override_cred) {
 		override_cred->fsuid = inode->i_uid;
 		override_cred->fsgid = inode->i_gid;
+		if (!hardlink) {
+			err = security_dentry_create_files_as(dentry,
+					stat->mode, &dentry->d_name, old_cred,
+					override_cred);
+			if (err) {
+				put_cred(override_cred);
+				goto out_revert_creds;
+			}
+		}
 		put_cred(override_creds(override_cred));
 		put_cred(override_cred);
 
@@ -499,6 +508,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 			err = ovl_create_over_whiteout(dentry, inode, stat,
 							link, hardlink);
 	}
+out_revert_creds:
 	revert_creds(old_cred);
 	if (!err) {
 		struct inode *realinode = d_inode(ovl_dentry_upper(dentry));
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a4585f961bf9..e2a94a26767b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -835,11 +835,11 @@ retry:
 			goto out_dput;
 
 		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
-		if (err && err != -ENODATA)
+		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
 		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
-		if (err && err != -ENODATA)
+		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
 		/* Clear any inherited mode bits */
diff --git a/fs/pnode.c b/fs/pnode.c
index 99899705b105..234a9ac49958 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -259,7 +259,7 @@ static int propagate_one(struct mount *m)
 		read_sequnlock_excl(&mount_lock);
 	}
 	hlist_add_head(&child->mnt_hash, list);
-	return 0;
+	return count_mounts(m->mnt_ns, child);
 }
 
 /*
diff --git a/fs/pnode.h b/fs/pnode.h
index 0fcdbe7ca648..550f5a8b4fcf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
 #endif /* _LINUX_PNODE_H */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac0df4dde823..3b792ab3c0dc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 		save_stack_trace_tsk(task, &trace);
 
 		for (i = 0; i < trace.nr_entries; i++) {
-			seq_printf(m, "[<%pK>] %pS\n",
+			seq_printf(m, "[<%pK>] %pB\n",
 				   (void *)entries[i], (void *)entries[i]);
 		}
 		unlock_trace(task);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index c633476616e0..bca66d83a765 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	atomic_set(&ent->count, 1);
 	spin_lock_init(&ent->pde_unload_lock);
 	INIT_LIST_HEAD(&ent->pde_openers);
+	proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+
 out:
 	return ent;
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..5c89a07e3d7f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
+	char *buf = file->private_data;
 	ssize_t acc = 0;
 	size_t size, tsz;
 	size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
-			char * elf_buf;
-
-			elf_buf = kzalloc(tsz, GFP_KERNEL);
-			if (!elf_buf)
-				return -ENOMEM;
-			vread(elf_buf, (char *)start, tsz);
+			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_user(buffer, elf_buf, tsz)) {
-				kfree(elf_buf);
+			if (copy_to_user(buffer, buf, tsz))
 				return -EFAULT;
-			}
-			kfree(elf_buf);
 		} else {
 			if (kern_addr_valid(start)) {
 				unsigned long n;
 
-				n = copy_to_user(buffer, (char *)start, tsz);
+				/*
+				 * Using bounce buffer to bypass the
+				 * hardened user copy kernel text checks.
+				 */
+				memcpy(buf, (char *) start, tsz);
+				n = copy_to_user(buffer, buf, tsz);
 				/*
 				 * We cannot distinguish between fault on source
 				 * and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
 {
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
+
+	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!filp->private_data)
+		return -ENOMEM;
+
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
@@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int release_kcore(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
 
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.release	= release_kcore,
 	.llseek		= default_llseek,
 };
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index c8bbc68cdb05..7ae6b1da7cab 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -21,6 +21,7 @@
 #include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
+#include <linux/uidgid.h>
 #include <net/net_namespace.h>
 #include <linux/seq_file.h>
 
@@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = {
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *netd, *net_statd;
+	kuid_t uid;
+	kgid_t gid;
 	int err;
 
 	err = -ENOMEM;
@@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net)
 	netd->parent = &proc_root;
 	memcpy(netd->name, "net", 4);
 
+	uid = make_kuid(net->user_ns, 0);
+	if (!uid_valid(uid))
+		uid = netd->uid;
+
+	gid = make_kgid(net->user_ns, 0);
+	if (!gid_valid(gid))
+		gid = netd->gid;
+
+	proc_set_user(netd, uid, gid);
+
 	err = -EEXIST;
 	net_statd = proc_net_mkdir(net, "stat", netd);
 	if (!net_statd)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1b93650dda2f..71025b9e2a4e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
 
 static void drop_sysctl_table(struct ctl_table_header *header);
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry, struct nsproxy *namespaces);
+	struct ctl_table **pentry);
 static int insert_links(struct ctl_table_header *head);
 static void put_links(struct ctl_table_header *header);
 
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
 }
 
 static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
 {
 	struct ctl_table_set *set = &root->default_set;
 	if (root->lookup)
-		set = root->lookup(root, namespaces);
+		set = root->lookup(root);
 	return set;
 }
 
@@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
 static struct inode *proc_sys_make_inode(struct super_block *sb,
 		struct ctl_table_header *head, struct ctl_table *table)
 {
+	struct ctl_table_root *root = head->root;
 	struct inode *inode;
 	struct proc_inode *ei;
 
@@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 		if (is_empty_dir(head))
 			make_empty_dir_inode(inode);
 	}
+
+	if (root->set_ownership)
+		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+
 out:
 	return inode;
 }
@@ -491,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	if (S_ISLNK(p->mode)) {
-		ret = sysctl_follow_link(&h, &p, current->nsproxy);
+		ret = sysctl_follow_link(&h, &p);
 		err = ERR_PTR(ret);
 		if (ret)
 			goto out;
@@ -659,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
 
 	if (S_ISLNK(table->mode)) {
 		/* It is not an error if we can not follow the link ignore it */
-		int err = sysctl_follow_link(&head, &table, current->nsproxy);
+		int err = sysctl_follow_link(&head, &table);
 		if (err)
 			goto out;
 	}
@@ -976,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
 }
 
 static int sysctl_follow_link(struct ctl_table_header **phead,
-	struct ctl_table **pentry, struct nsproxy *namespaces)
+	struct ctl_table **pentry)
 {
 	struct ctl_table_header *head;
 	struct ctl_table_root *root;
@@ -988,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
 	ret = 0;
 	spin_lock(&sysctl_lock);
 	root = (*pentry)->data;
-	set = lookup_header_set(root, namespaces);
+	set = lookup_header_set(root);
 	dir = xlate_dir(set, (*phead)->parent);
 	if (IS_ERR(dir))
 		ret = PTR_ERR(dir);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..f6fa99eca515 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		mss->anonymous_thp += HPAGE_PMD_SIZE;
 	else if (PageSwapBacked(page))
 		mss->shmem_thp += HPAGE_PMD_SIZE;
+	else if (is_zone_device_page(page))
+		/* pass */;
 	else
 		VM_BUG_ON_PAGE(1, page);
 	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 35df08ee9c97..2d445425aad7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -341,6 +341,7 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
 	struct qc_state state;
 	int ret;
 
+	memset(&state, 0, sizeof (struct qc_state));
 	ret = sb->s_qcop->get_state(sb, &state);
 	if (ret < 0)
 		return ret;
@@ -365,17 +366,19 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
 	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
 	fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
 	fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
-	if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+
+	/* Inodes may be allocated even if inactive; copy out if present */
+	if (state.s_state[USRQUOTA].ino) {
 		fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
 		fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
 		fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
 	}
-	if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+	if (state.s_state[GRPQUOTA].ino) {
 		fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
 		fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
 		fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
 	}
-	if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+	if (state.s_state[PRJQUOTA].ino) {
 		/*
 		 * Q_XGETQSTAT doesn't have room for both group and project
 		 * quotas.  So, allow the project quota values to be copied out
@@ -411,6 +414,7 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
 	struct qc_state state;
 	int ret;
 
+	memset(&state, 0, sizeof (struct qc_state));
 	ret = sb->s_qcop->get_state(sb, &state);
 	if (ret < 0)
 		return ret;
@@ -435,17 +439,19 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
 	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
 	fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
 	fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
-	if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+
+	/* Inodes may be allocated even if inactive; copy out if present */
+	if (state.s_state[USRQUOTA].ino) {
 		fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
 		fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
 		fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
 	}
-	if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+	if (state.s_state[GRPQUOTA].ino) {
 		fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
 		fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
 		fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
 	}
-	if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+	if (state.s_state[PRJQUOTA].ino) {
 		fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino;
 		fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks;
 		fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents;
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 183a212694bf..12af0490322f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -27,9 +27,17 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/ramfs.h>
+#include <linux/sched.h>
 
 #include "internal.h"
 
+static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
 const struct file_operations ramfs_file_operations = {
 	.read_iter	= generic_file_read_iter,
 	.write_iter	= generic_file_write_iter,
@@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.llseek		= generic_file_llseek,
+	.get_unmapped_area	= ramfs_mmu_get_unmapped_area,
 };
 
 const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7a4a85a6821e..74d5ddd26296 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s,
 static int reiserfs_quota_on_mount(struct super_block *, int);
 #endif
 
-/* look for uncompleted unlinks and truncates and complete them */
+/*
+ * Look for uncompleted unlinks and truncates and complete them
+ *
+ * Called with superblock write locked.  If quotas are enabled, we have to
+ * release/retake lest we call dquot_quota_on_mount(), proceed to
+ * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
+ * cpu worklets to complete flush_async_commits() that in turn wait for the
+ * superblock write lock.
+ */
 static int finish_unfinished(struct super_block *s)
 {
 	INITIALIZE_PATH(path);
@@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s)
 				quota_enabled[i] = 0;
 				continue;
 			}
+			reiserfs_write_unlock(s);
 			ret = reiserfs_quota_on_mount(s, i);
+			reiserfs_write_lock(s);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index dc1358b5ec95..ac2de0ed69ad 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -233,8 +233,8 @@ void sysfs_remove_group(struct kobject *kobj,
 		kn = kernfs_find_and_get(parent, grp->name);
 		if (!kn) {
 			WARN(!kn, KERN_WARNING
-			     "sysfs group %p not found for kobject '%s'\n",
-			     grp, kobject_name(kobj));
+			     "sysfs group '%s' not found for kobject '%s'\n",
+			     grp->name, kobject_name(kobj));
 			return;
 		}
 	} else {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 632570617327..e855bf8d74b4 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -94,7 +94,7 @@ static int udf_adinicb_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
-	if (!PageUptodate(page) && len != PAGE_SIZE)
+	if (!PageUptodate(page))
 		__udf_adinicb_readpage(page);
 	return 0;
 }
@@ -105,11 +105,25 @@ static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return 0;
 }
 
+static int udf_adinicb_write_end(struct file *file, struct address_space *mapping,
+				 loff_t pos, unsigned len, unsigned copied,
+				 struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t last_pos = pos + copied;
+	if (last_pos > inode->i_size)
+		i_size_write(inode, last_pos);
+	set_page_dirty(page);
+	unlock_page(page);
+	put_page(page);
+	return copied;
+}
+
 const struct address_space_operations udf_adinicb_aops = {
 	.readpage	= udf_adinicb_readpage,
 	.writepage	= udf_adinicb_writepage,
 	.write_begin	= udf_adinicb_write_begin,
-	.write_end	= simple_write_end,
+	.write_end	= udf_adinicb_write_end,
 	.direct_IO	= udf_adinicb_direct_IO,
 };
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fc593c869493..584e87e11cb6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -52,6 +52,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_inode_fork.o \
 				   xfs_inode_buf.o \
 				   xfs_log_rlimit.o \
+				   xfs_ag_resv.o \
 				   xfs_rmap.o \
 				   xfs_rmap_btree.o \
 				   xfs_sb.o \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e3ae0f2b4294
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG.  However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion.  Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior:  First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken.  Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting.  Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations.  Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions.  In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function.  It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction.  The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks.  Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks?  For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			avail;
+	xfs_extlen_t			orig;
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+		avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+		orig = pag->pag_meta_resv.ar_asked;
+		break;
+	case XFS_AG_RESV_AGFL:
+		avail = pag->pagf_freeblks + pag->pagf_flcount -
+			pag->pag_meta_resv.ar_reserved;
+		orig = pag->pag_agfl_resv.ar_asked;
+		break;
+	default:
+		ASSERT(0);
+		return false;
+	}
+
+	trace_xfs_ag_resv_critical(pag, type, avail);
+
+	/* Critically low if less than 10% or max btree height remains. */
+	return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			len;
+
+	len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		len -= xfs_perag_resv(pag, type)->ar_reserved;
+		break;
+	case XFS_AG_RESV_NONE:
+		/* empty */
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	trace_xfs_ag_resv_needed(pag, type, len);
+
+	return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			oldresv;
+	int				error;
+
+	trace_xfs_ag_resv_free(pag, type, 0);
+
+	resv = xfs_perag_resv(pag, type);
+	pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+	/*
+	 * AGFL blocks are always considered "free", so whatever
+	 * was reserved at mount time must be given back at umount.
+	 */
+	if (type == XFS_AG_RESV_AGFL)
+		oldresv = resv->ar_orig_reserved;
+	else
+		oldresv = resv->ar_reserved;
+	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	resv->ar_reserved = 0;
+	resv->ar_asked = 0;
+
+	if (error)
+		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+	return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+	struct xfs_perag		*pag)
+{
+	int				error;
+	int				err2;
+
+	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+	if (err2 && !error)
+		error = err2;
+	return error;
+}
+
+static int
+__xfs_ag_resv_init(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	xfs_extlen_t			ask,
+	xfs_extlen_t			used)
+{
+	struct xfs_mount		*mp = pag->pag_mount;
+	struct xfs_ag_resv		*resv;
+	int				error;
+
+	resv = xfs_perag_resv(pag, type);
+	if (used > ask)
+		ask = used;
+	resv->ar_asked = ask;
+	resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+	mp->m_ag_max_usable -= ask;
+
+	trace_xfs_ag_resv_init(pag, type, ask);
+
+	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+	if (error)
+		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+
+	return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+	struct xfs_perag		*pag)
+{
+	xfs_extlen_t			ask;
+	xfs_extlen_t			used;
+	int				error = 0;
+
+	/* Create the metadata reservation. */
+	if (pag->pag_meta_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error)
+			goto out;
+	}
+
+	/* Create the AGFL metadata reservation */
+	if (pag->pag_agfl_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+		if (error)
+			goto out;
+	}
+
+out:
+	return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_alloc_arg		*args)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			len;
+	uint				field;
+
+	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+				       XFS_TRANS_SB_FDBLOCKS;
+		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+		return;
+	}
+
+	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+	resv->ar_reserved -= len;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Allocations of reserved blocks only need on-disk sb updates... */
+	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+	/* ...but non-reserved blocks need in-core and on-disk updates. */
+	if (args->len > len)
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+				-((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_trans		*tp,
+	xfs_extlen_t			len)
+{
+	xfs_extlen_t			leftover;
+	struct xfs_ag_resv		*resv;
+
+	trace_xfs_ag_resv_free_extent(pag, type, len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+		return;
+	}
+
+	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+	resv->ar_reserved += leftover;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Freeing into the reserved pool only requires on-disk update... */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+	/* ...but freeing beyond that requires in-core and on-disk update. */
+	if (len > leftover)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define	__XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+		enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif	/* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 05b5243d89f6..ca75dc90ebe0 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_ag_resv.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
  * extents need to be actually allocated. To get around this, we explicitly set
  * aside a few blocks which will not be reserved in delayed allocation.
  *
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block.  The bmbt block
- * might not be in the same AG as the file data extent.  In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
  */
 unsigned int
 xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
 	unsigned int		blocks;
 
 	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
 	return blocks;
 }
 
@@ -265,7 +258,7 @@ xfs_alloc_compute_diff(
 	xfs_agblock_t	wantbno,	/* target starting block */
 	xfs_extlen_t	wantlen,	/* target length */
 	xfs_extlen_t	alignment,	/* target alignment */
-	char		userdata,	/* are we allocating data? */
+	int		datatype,	/* are we allocating data? */
 	xfs_agblock_t	freebno,	/* freespace's starting block */
 	xfs_extlen_t	freelen,	/* freespace's length */
 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
@@ -276,6 +269,7 @@ xfs_alloc_compute_diff(
 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
 	xfs_agblock_t	wantend;	/* end of target extent */
+	bool		userdata = xfs_alloc_is_userdata(datatype);
 
 	ASSERT(freelen >= wantlen);
 	freeend = freebno + freelen;
@@ -680,12 +674,29 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
+	xfs_extlen_t	reservation;
+	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
 	ASSERT(args->minlen <= args->maxlen);
 	ASSERT(args->mod < args->prod);
 	ASSERT(args->alignment > 0);
+
+	/*
+	 * Clamp maxlen to the amount of free space minus any reservations
+	 * that have been made.
+	 */
+	oldmax = args->maxlen;
+	reservation = xfs_ag_resv_needed(args->pag, args->resv);
+	if (args->maxlen > args->pag->pagf_freeblks - reservation)
+		args->maxlen = args->pag->pagf_freeblks - reservation;
+	if (args->maxlen == 0) {
+		args->agbno = NULLAGBLOCK;
+		args->maxlen = oldmax;
+		return 0;
+	}
+
 	/*
 	 * Branch to correct routine based on the type.
 	 */
@@ -705,12 +716,14 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
+	args->maxlen = oldmax;
+
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
 	ASSERT(args->len >= args->minlen);
 	ASSERT(args->len <= args->maxlen);
-	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
 	ASSERT(args->agbno % args->alignment == 0);
 
 	/* if not file data, insert new block into the reverse map btree */
@@ -732,12 +745,7 @@ xfs_alloc_ag_vextent(
 					      args->agbno, args->len));
 	}
 
-	if (!args->isfl) {
-		xfs_trans_mod_sb(args->tp, args->wasdel ?
-				 XFS_TRANS_SB_RES_FDBLOCKS :
-				 XFS_TRANS_SB_FDBLOCKS,
-				 -((long)(args->len)));
-	}
+	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
 
 	XFS_STATS_INC(args->mp, xs_allocx);
 	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -917,7 +925,7 @@ xfs_alloc_find_best_extent(
 
 			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
 						       args->alignment,
-						       args->userdata, *sbnoa,
+						       args->datatype, *sbnoa,
 						       *slena, &new);
 
 			/*
@@ -1101,7 +1109,7 @@ restart:
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, ltbnoa,
+				args->alignment, args->datatype, ltbnoa,
 				ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
@@ -1254,7 +1262,7 @@ restart:
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, ltbnoa,
+				args->alignment, args->datatype, ltbnoa,
 				ltlena, &ltnew);
 
 			error = xfs_alloc_find_best_extent(args,
@@ -1271,7 +1279,7 @@ restart:
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, gtbnoa,
+				args->alignment, args->datatype, gtbnoa,
 				gtlena, &gtnew);
 
 			error = xfs_alloc_find_best_extent(args,
@@ -1331,7 +1339,7 @@ restart:
 	}
 	rlen = args->len;
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-				     args->userdata, ltbnoa, ltlena, &ltnew);
+				     args->datatype, ltbnoa, ltlena, &ltnew);
 	ASSERT(ltnew >= ltbno);
 	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small(
 	int		*stat)	/* status: 0-freelist, 1-normal/none */
 {
 	struct xfs_owner_info	oinfo;
+	struct xfs_perag	*pag;
 	int		error;
 	xfs_agblock_t	fbno;
 	xfs_extlen_t	flen;
@@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small(
 	 * to respect minleft even when pulling from the
 	 * freelist.
 	 */
-	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+	else if (args->minlen == 1 && args->alignment == 1 &&
+		 args->resv != XFS_AG_RESV_AGFL &&
 		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
 		  > args->minleft)) {
 		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small(
 			goto error0;
 		if (fbno != NULLAGBLOCK) {
 			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
-					     args->userdata);
+			      xfs_alloc_allow_busy_reuse(args->datatype));
 
-			if (args->userdata) {
+			if (xfs_alloc_is_userdata(args->datatype)) {
 				xfs_buf_t	*bp;
 
 				bp = xfs_btree_get_bufs(args->mp, args->tp,
@@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small(
 			/*
 			 * If we're feeding an AGFL block to something that
 			 * doesn't live in the free space, we need to clear
-			 * out the OWN_AG rmap.
+			 * out the OWN_AG rmap and add the block back to
+			 * the AGFL per-AG reservation.
 			 */
 			xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 			error = xfs_rmap_free(args->tp, args->agbp, args->agno,
 					fbno, 1, &oinfo);
 			if (error)
 				goto error0;
+			pag = xfs_perag_get(args->mp, args->agno);
+			xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+					args->tp, 1);
+			xfs_perag_put(pag);
 
 			*stat = 0;
 			return 0;
@@ -1683,7 +1698,7 @@ xfs_free_ag_extent(
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
 	struct xfs_owner_info	*oinfo,
-	int			isfl)
+	enum xfs_ag_resv_type	type)
 {
 	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
 	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
@@ -1911,21 +1926,22 @@ xfs_free_ag_extent(
 	 */
 	pag = xfs_perag_get(mp, agno);
 	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_ag_resv_free_extent(pag, type, tp, len);
 	xfs_perag_put(pag);
 	if (error)
 		goto error0;
 
-	if (!isfl)
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
 	XFS_STATS_INC(mp, xs_freex);
 	XFS_STATS_ADD(mp, xs_freeb, len);
 
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+			haveleft, haveright);
 
 	return 0;
 
  error0:
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+			-1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels(
 }
 
 /*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG.  The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
  */
 xfs_extlen_t
 xfs_alloc_longest_free_extent(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
-	xfs_extlen_t		need)
+	xfs_extlen_t		need,
+	xfs_extlen_t		reserved)
 {
 	xfs_extlen_t		delta = 0;
 
+	/*
+	 * If the AGFL needs a recharge, we'll have to subtract that from the
+	 * longest extent.
+	 */
 	if (need > pag->pagf_flcount)
 		delta = need - pag->pagf_flcount;
 
+	/*
+	 * If we cannot maintain others' reservations with space from the
+	 * not-longest freesp extents, we'll have to subtract /that/ from
+	 * the longest extent too.
+	 */
+	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+	/*
+	 * If the longest extent is long enough to satisfy all the
+	 * reservations and AGFL rules in place, we can return this extent.
+	 */
 	if (pag->pagf_longest > delta)
 		return pag->pagf_longest - delta;
+
+	/* Otherwise, let the caller try for 1 block if there's space. */
 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
 }
 
@@ -2004,20 +2042,24 @@ xfs_alloc_space_available(
 {
 	struct xfs_perag	*pag = args->pag;
 	xfs_extlen_t		longest;
+	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
 
 	if (flags & XFS_ALLOC_FLAG_FREEING)
 		return true;
 
+	reservation = xfs_ag_resv_needed(pag, args->resv);
+
 	/* do we have enough contiguous free space for the allocation? */
-	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+			reservation);
 	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
 		return false;
 
-	/* do have enough free space remaining for the allocation? */
+	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-			  min_free - args->total);
-	if (available < (int)args->minleft)
+			  reservation - min_free - args->total);
+	if (available < (int)args->minleft || available <= 0)
 		return false;
 
 	return true;
@@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist(
 	 * somewhere else if we are not being asked to try harder at this
 	 * point
 	 */
-	if (pag->pagf_metadata && args->userdata &&
+	if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
 	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
 		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
 		goto out_agbp_relse;
@@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist(
 		if (error)
 			goto out_agbp_relse;
 		error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
-					   &targs.oinfo, 1);
+					   &targs.oinfo, XFS_AG_RESV_AGFL);
 		if (error)
 			goto out_agbp_relse;
 		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist(
 	targs.mp = mp;
 	targs.agbp = agbp;
 	targs.agno = args->agno;
-	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.alignment = targs.minlen = targs.prod = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
 	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist(
 	while (pag->pagf_flcount < need) {
 		targs.agbno = 0;
 		targs.maxlen = need - pag->pagf_flcount;
+		targs.resv = XFS_AG_RESV_AGFL;
 
 		/* Allocate as many blocks as possible at once. */
 		error = xfs_alloc_ag_vextent(&targs);
@@ -2633,7 +2676,7 @@ xfs_alloc_vextent(
 		 * Try near allocation first, then anywhere-in-ag after
 		 * the first a.g. fails.
 		 */
-		if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
+		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
 		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
 			args->fsbno = XFS_AGB_TO_FSB(mp,
 					((mp->m_agfrotor / rotorstep) %
@@ -2766,7 +2809,7 @@ xfs_alloc_vextent(
 #endif
 
 		/* Zero the extent if we were asked to do so */
-		if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+		if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
 			error = xfs_zero_extent(args->ip, args->fsbno, args->len);
 			if (error)
 				goto error0;
@@ -2825,7 +2868,8 @@ xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo)	/* extent owner */
+	struct xfs_owner_info	*oinfo,	/* extent owner */
+	enum xfs_ag_resv_type	type)	/* block reservation type */
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_buf		*agbp;
@@ -2834,6 +2878,7 @@ xfs_free_extent(
 	int			error;
 
 	ASSERT(len != 0);
+	ASSERT(type != XFS_AG_RESV_AGFL);
 
 	if (XFS_TEST_ERROR(false, mp,
 			XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2896,7 @@ xfs_free_extent(
 		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
 				err);
 
-	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
 	if (error)
 		goto err;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6fe2d6b7cfe9..7c404a6b0ae3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
 	xfs_extlen_t	len;		/* output: actual size of extent */
 	xfs_alloctype_t	type;		/* allocation type XFS_ALLOCTYPE_... */
 	xfs_alloctype_t	otype;		/* original allocation type */
+	int		datatype;	/* mask defining data type treatment */
 	char		wasdel;		/* set if allocation was prev delayed */
 	char		wasfromfl;	/* set if allocation is from freelist */
-	char		isfl;		/* set if is freelist blocks - !acctg */
-	char		userdata;	/* mask defining userdata treatment */
 	xfs_fsblock_t	firstblock;	/* io first block allocated */
 	struct xfs_owner_info	oinfo;	/* owner of blocks being allocated */
+	enum xfs_ag_resv_type	resv;	/* block reservation to use */
 } xfs_alloc_arg_t;
 
 /*
- * Defines for userdata
+ * Defines for datatype
  */
 #define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
 #define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */
+
+static inline bool	/* true if any flag besides XFS_ALLOC_NOBUSY is set */
+xfs_alloc_is_userdata(int datatype)
+{
+	return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
+}
+
+static inline bool	/* true unless XFS_ALLOC_NOBUSY is set in datatype */
+xfs_alloc_allow_busy_reuse(int datatype)
+{
+	return (datatype & XFS_ALLOC_NOBUSY) == 0;
+}
 
 /* freespace limit calculations */
 #define XFS_ALLOC_AGFL_RESERVE	4
@@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
 unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
 
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
-		struct xfs_perag *pag, xfs_extlen_t need);
+		struct xfs_perag *pag, xfs_extlen_t need,
+		xfs_extlen_t reserved);
 unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 
@@ -184,7 +198,8 @@ xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo);/* extent owner */
+	struct xfs_owner_info	*oinfo,	/* extent owner */
+	enum xfs_ag_resv_type	type);	/* block reservation type */
 
 int				/* error */
 xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b060bca93402..9d7f61d36645 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,7 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
@@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents(
  * Else, *lastxp will be set to the index of the found
  * entry; *gotp will contain the entry.
  */
-STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
 xfs_bmap_search_extents(
 	xfs_inode_t     *ip,            /* incore inode pointer */
 	xfs_fileoff_t   bno,            /* block number searched for */
@@ -3347,7 +3348,8 @@ xfs_bmap_adjacent(
 
 	mp = ap->ip->i_mount;
 	nullfb = *ap->firstblock == NULLFSBLOCK;
-	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+	rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+		xfs_alloc_is_userdata(ap->datatype);
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	/*
 	 * If allocating at eof, and there's a previous real block,
@@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent(
 	}
 
 	longest = xfs_alloc_longest_free_extent(mp, pag,
-					xfs_alloc_min_freelist(mp, pag));
+				xfs_alloc_min_freelist(mp, pag),
+				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
 	if (*blen < longest)
 		*blen = longest;
 
@@ -3622,7 +3625,7 @@ xfs_bmap_btalloc(
 {
 	xfs_mount_t	*mp;		/* mount point structure */
 	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
-	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_extlen_t	align = 0;	/* minimum allocation alignment */
 	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
 	xfs_agnumber_t	ag;
 	xfs_alloc_arg_t	args;
@@ -3645,7 +3648,8 @@ xfs_bmap_btalloc(
 	else if (mp->m_dalign)
 		stripe_align = mp->m_dalign;
 
-	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+	if (xfs_alloc_is_userdata(ap->datatype))
+		align = xfs_get_extsz_hint(ap->ip);
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 						align, 0, ap->eof, 0, ap->conv,
@@ -3658,7 +3662,8 @@ xfs_bmap_btalloc(
 	nullfb = *ap->firstblock == NULLFSBLOCK;
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
-		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+		if (xfs_alloc_is_userdata(ap->datatype) &&
+		    xfs_inode_is_filestream(ap->ip)) {
 			ag = xfs_filestream_lookup_ag(ap->ip);
 			ag = (ag != NULLAGNUMBER) ? ag : 0;
 			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
@@ -3698,7 +3703,8 @@ xfs_bmap_btalloc(
 		 * enough for the request.  If one isn't found, then adjust
 		 * the minimum allocation size to the largest space found.
 		 */
-		if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+		if (xfs_alloc_is_userdata(ap->datatype) &&
+		    xfs_inode_is_filestream(ap->ip))
 			error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
 		else
 			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
@@ -3781,9 +3787,9 @@ xfs_bmap_btalloc(
 	}
 	args.minleft = ap->minleft;
 	args.wasdel = ap->wasdel;
-	args.isfl = 0;
-	args.userdata = ap->userdata;
-	if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+	args.resv = XFS_AG_RESV_NONE;
+	args.datatype = ap->datatype;
+	if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
 		args.ip = ap->ip;
 
 	error = xfs_alloc_vextent(&args);
@@ -3877,7 +3883,8 @@ STATIC int
 xfs_bmap_alloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
 {
-	if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+	if (XFS_IS_REALTIME_INODE(ap->ip) &&
+	    xfs_alloc_is_userdata(ap->datatype))
 		return xfs_bmap_rtalloc(ap);
 	return xfs_bmap_btalloc(ap);
 }
@@ -4074,7 +4081,7 @@ xfs_bmapi_read(
 	return 0;
 }
 
-STATIC int
+int
 xfs_bmapi_reserve_delalloc(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		aoff,
@@ -4170,91 +4177,6 @@ out_unreserve_quota:
 	return error;
 }
 
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
-	struct xfs_inode	*ip,	/* incore inode */
-	xfs_fileoff_t		bno,	/* starting file offs. mapped */
-	xfs_filblks_t		len,	/* length to map in file */
-	struct xfs_bmbt_irec	*mval,	/* output: map values */
-	int			*nmap,	/* i/o: mval size/count */
-	int			flags)	/* XFS_BMAPI_... */
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	struct xfs_bmbt_irec	got;	/* current file extent record */
-	struct xfs_bmbt_irec	prev;	/* previous file extent record */
-	xfs_fileoff_t		obno;	/* old block number (offset) */
-	xfs_fileoff_t		end;	/* end of mapped file region */
-	xfs_extnum_t		lastx;	/* last useful extent number */
-	int			eof;	/* we've hit the end of extents */
-	int			n = 0;	/* current extent index */
-	int			error = 0;
-
-	ASSERT(*nmap >= 1);
-	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
-	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
-		return -EFSCORRUPTED;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	XFS_STATS_INC(mp, xs_blk_mapw);
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
-		if (error)
-			return error;
-	}
-
-	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
-	end = bno + len;
-	obno = bno;
-
-	while (bno < end && n < *nmap) {
-		if (eof || got.br_startoff > bno) {
-			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
-							   &prev, &lastx, eof);
-			if (error) {
-				if (n == 0) {
-					*nmap = 0;
-					return error;
-				}
-				break;
-			}
-		}
-
-		/* set up the extent map to return. */
-		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-		/* If we're done, stop now. */
-		if (bno >= end || n >= *nmap)
-			break;
-
-		/* Else go on to the next record. */
-		prev = got;
-		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-		else
-			eof = 1;
-	}
-
-	*nmap = n;
-	return 0;
-}
-
-
 static int
 xfs_bmapi_allocate(
 	struct xfs_bmalloca	*bma)
@@ -4287,15 +4209,21 @@ xfs_bmapi_allocate(
 	}
 
 	/*
-	 * Indicate if this is the first user data in the file, or just any
-	 * user data. And if it is userdata, indicate whether it needs to
-	 * be initialised to zero during allocation.
+	 * Set the data type being allocated. For the data fork, the first data
+	 * in the file is treated differently to all other allocations. For the
+	 * attribute fork, we only need to ensure the allocated range is not on
+	 * the busy list.
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
-		bma->userdata = (bma->offset == 0) ?
-			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+		bma->datatype = XFS_ALLOC_NOBUSY;
+		if (whichfork == XFS_DATA_FORK) {
+			if (bma->offset == 0)
+				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+			else
+				bma->datatype |= XFS_ALLOC_USERDATA;
+		}
 		if (bma->flags & XFS_BMAPI_ZERO)
-			bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
+			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
 
 	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4565,7 +4493,7 @@ xfs_bmapi_write(
 	bma.tp = tp;
 	bma.ip = ip;
 	bma.total = total;
-	bma.userdata = 0;
+	bma.datatype = 0;
 	bma.dfops = dfops;
 	bma.firstblock = firstblock;
 
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 254034f96941..8395f6e8cf7d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -54,7 +54,7 @@ struct xfs_bmalloca {
 	bool			wasdel;	/* replacing a delayed allocation */
 	bool			aeof;	/* allocated space at eof */
 	bool			conv;	/* overwriting unwritten extents */
-	char			userdata;/* userdata mask */
+	int			datatype;/* data type being allocated */
 	int			flags;
 };
 
@@ -181,9 +181,6 @@ int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
 		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
 		int *nmap, int flags);
-int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
-		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
-		int *nmap, int flags);
 int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
 		xfs_fsblock_t *firstblock, xfs_extlen_t total,
@@ -202,5 +199,12 @@ int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_defer_ops *dfops, enum shift_direction direction,
 		int num_exts);
 int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+struct xfs_bmbt_rec_host *
+	xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
+		int fork, int *eofp, xfs_extnum_t *lastxp,
+		struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
+int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
+		xfs_filblks_t len, struct xfs_bmbt_irec *got,
+		struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..aa1752f918b8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
 	struct xfs_buf		*bp0,
 	bool			force_all)
 {
-	union xfs_btree_bigkey	key;	/* keys from current level */
+	union xfs_btree_key	key;	/* keys from current level */
 	union xfs_btree_key	*lkey;	/* keys from the next level up */
 	union xfs_btree_key	*hkey;
 	union xfs_btree_key	*nlkey;	/* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
 
 	trace_xfs_btree_updkeys(cur, level, bp0);
 
-	lkey = (union xfs_btree_key *)&key;
+	lkey = &key;
 	hkey = xfs_btree_high_key_from_key(cur, lkey);
 	xfs_btree_get_keys(cur, block, lkey);
 	for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
 	struct xfs_buf		*bp;	/* buffer for block */
 	union xfs_btree_ptr	nptr;	/* new block ptr */
 	struct xfs_btree_cur	*ncur;	/* new btree cursor */
-	union xfs_btree_bigkey	nkey;	/* new block key */
+	union xfs_btree_key	nkey;	/* new block key */
 	union xfs_btree_key	*lkey;
 	int			optr;	/* old key/record index */
 	int			ptr;	/* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
 	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
 
 	ncur = NULL;
-	lkey = (union xfs_btree_key *)&nkey;
+	lkey = &nkey;
 
 	/*
 	 * If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
 	union xfs_btree_ptr	nptr;	/* new block number (split result) */
 	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
 	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
-	union xfs_btree_bigkey	bkey;	/* key of block to insert */
+	union xfs_btree_key	bkey;	/* key of block to insert */
 	union xfs_btree_key	*key;
 	union xfs_btree_rec	rec;	/* record to insert */
 
 	level = 0;
 	ncur = NULL;
 	pcur = cur;
-	key = (union xfs_btree_key *)&bkey;
+	key = &bkey;
 
 	xfs_btree_set_ptr_null(cur, &nptr);
 
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
 			fn, priv);
 }
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.  Returns 0 when len <= 1.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+	struct xfs_mount	*mp,	/* not referenced in this body */
+	uint			*limits,	/* [0] first pass, [1] later passes */
+	unsigned long long	len)	/* number of records */
+{
+	int			level;	/* bumped per pass; never read */
+	int			maxrecs;	/* divisor for the current pass */
+	xfs_extlen_t		rval;	/* running total of blocks */
+
+	maxrecs = limits[0];
+	for (level = 0, rval = 0; len > 1; level++) {
+		len += maxrecs - 1;	/* bias so the division rounds up */
+		do_div(len, maxrecs);	/* len is now this pass's block count */
+		maxrecs = limits[1];	/* every later pass uses limits[1] */
+		rval += len;
+	}
+	return rval;
+}
+
+static int	/* always 0 */
+xfs_btree_count_blocks_helper(
+	struct xfs_btree_cur	*cur,	/* unused */
+	int			level,	/* unused */
+	void			*data)	/* points at an xfs_extlen_t counter */
+{
+	xfs_extlen_t		*blocks = data;
+
+	(*blocks)++;	/* one call == one more block counted */
+	return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int	/* whatever xfs_btree_visit_blocks() returns */
+xfs_btree_count_blocks(
+	struct xfs_btree_cur	*cur,	/* cursor passed to the block walk */
+	xfs_extlen_t		*blocks)	/* out: number of blocks counted */
+{
+	*blocks = 0;	/* the helper increments from zero */
+	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+			blocks);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..3f8556a5c2ad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
 	__be64			l;	/* long form ptr */
 };
 
-union xfs_btree_key {
-	struct xfs_bmbt_key		bmbt;
-	xfs_bmdr_key_t			bmbr;	/* bmbt root block */
-	xfs_alloc_key_t			alloc;
-	struct xfs_inobt_key		inobt;
-	struct xfs_rmap_key		rmap;
-};
-
 /*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory.  Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key.  Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both.  The __*bigkey
+ * items should never be accessed directly.
  */
-union xfs_btree_bigkey {
+union xfs_btree_key {
 	struct xfs_bmbt_key		bmbt;
 	xfs_bmdr_key_t			bmbr;	/* bmbt root block */
 	xfs_alloc_key_t			alloc;
 	struct xfs_inobt_key		inobt;
-	struct {
-		struct xfs_rmap_key	rmap;
-		struct xfs_rmap_key	rmap_hi;
-	};
+	struct xfs_rmap_key		rmap;
+	struct xfs_rmap_key		__rmap_bigkey[2];
 };
 
 union xfs_btree_rec {
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
 uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
 				 unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+		unsigned long long len);
 
 /* return codes */
 #define XFS_BTREE_QUERY_RANGE_CONTINUE	0	/* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
 int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
 		xfs_btree_visit_blocks_fn fn, void *data);
 
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
  *   - For each work item attached to the log intent item,
  *     * Perform the described action.
  *     * Attach the work item to the log done item.
+ *     * If the result of doing the work was -EAGAIN, ->finish_item
+ *       wants a new transaction.  See the "Requesting a Fresh
+ *       Transaction while Finishing Deferred Work" section below for
+ *       details.
  *
  * The key here is that we must log an intent item for all pending
  * work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
  * we can perform complex remapping operations, chaining intent items
  * as needed.
  *
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation.  The most likely cause of this circumstance is the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off.  The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish.  Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work.  Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue.  Then it rolls the
+ * transaction and picks up processing where it left off.  It is
+ * required that ->finish_item be careful to leave enough
+ * transaction reservation to fit the new log intent item.
+ *
  * This is an example of remapping the extent (E, E+B) into file X at
  * offset A and dealing with the extent (C, C+B) already being mapped
  * there:
@@ -104,21 +136,26 @@
  * | Intent to add rmap (X, E, A, B)                 |
  * +-------------------------------------------------+
  * | Reduce refcount for extent (C, B)               | t2
- * | Done reducing refcount for extent (C, B)        |
+ * | Done reducing refcount for extent (C, 9)        |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates)     |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9)           | t3
+ * | Done reducing refcount for extent (C+9, B-9)    |
  * | Increase refcount for extent (E, B)             |
  * | Done increasing refcount for extent (E, B)      |
  * | Intent to free extent (C, B)                    |
  * | Intent to free extent (F, 1) (refcountbt block) |
  * | Intent to remove rmap (F, 1, REFC)              |
  * +-------------------------------------------------+
- * | Remove rmap (X, C, A, B)                        | t3
+ * | Remove rmap (X, C, A, B)                        | t4
  * | Done removing rmap (X, C, A, B)                 |
  * | Add rmap (X, E, A, B)                           |
  * | Done adding rmap (X, E, A, B)                   |
  * | Remove rmap (F, 1, REFC)                        |
  * | Done removing rmap (F, 1, REFC)                 |
  * +-------------------------------------------------+
- * | Free extent (C, B)                              | t4
+ * | Free extent (C, B)                              | t5
  * | Done freeing extent (C, B)                      |
  * | Free extent (D, 1)                              |
  * | Done freeing extent (D, 1)                      |
@@ -141,6 +178,9 @@
  * - Intent to free extent (C, B)
  * - Intent to free extent (F, 1) (refcountbt block)
  * - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
  */
 
 static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
 			dfp->dfp_count--;
 			error = dfp->dfp_type->finish_item(*tp, dop, li,
 					dfp->dfp_done, &state);
-			if (error) {
+			if (error == -EAGAIN) {
+				/*
+				 * Caller wants a fresh transaction;
+				 * put the work item back on the list
+				 * and jump out.
+				 */
+				list_add(li, &dfp->dfp_work);
+				dfp->dfp_count++;
+				break;
+			} else if (error) {
 				/*
 				 * Clean up after ourselves and jump out.
 				 * xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
 				goto out;
 			}
 		}
-		/* Done with the dfp, free it. */
-		list_del(&dfp->dfp_list);
-		kmem_free(dfp);
+		if (error == -EAGAIN) {
+			/*
+			 * Caller wants a fresh transaction, so log a
+			 * new log intent item to replace the old one
+			 * and roll the transaction.  See "Requesting
+			 * a Fresh Transaction while Finishing
+			 * Deferred Work" above.
+			 */
+			dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+					dfp->dfp_count);
+			dfp->dfp_done = NULL;
+			list_for_each(li, &dfp->dfp_work)
+				dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+						li);
+		} else {
+			/* Done with the dfp, free it. */
+			list_del(&dfp->dfp_list);
+			kmem_free(dfp);
+		}
 
 		if (cleanup_fn)
 			cleanup_fn(*tp, state, error);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
 	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
 	return xfs_free_extent(cur->bc_tp,
 			XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
-			&oinfo);
+			&oinfo, XFS_AG_RESV_NONE);
 }
 
 STATIC int
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..fc5eef85d61e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
 	__uint16_t		rui_size;	/* size of this item */
 	__uint32_t		rui_nextents;	/* # extents to free */
 	__uint64_t		rui_id;		/* rui identifier */
-	struct xfs_map_extent	rui_extents[1];	/* array of extents to rmap */
+	struct xfs_map_extent	rui_extents[];	/* array of extents to rmap */
 };
 
+static inline size_t	/* bytes: fixed header plus nr extent entries */
+xfs_rui_log_format_sizeof(
+	unsigned int		nr)	/* number of rui_extents[] entries */
+{
+	return sizeof(struct xfs_rui_log_format) +
+			nr * sizeof(struct xfs_map_extent);
+}
+
 /*
  * This is the structure used to lay out an rud log item in the
  * log.  The rud_extents array is a variable size array whose
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..4a28fa91e3b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
  * Update on-disk file size now that data has been written to disk.
  */
 STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
 	struct xfs_inode	*ip,
 	struct xfs_trans	*tp,
 	xfs_off_t		offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp);
 }
 
+int	/* result of xfs_trans_alloc() or of __xfs_setfilesize() */
+xfs_setfilesize(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	size_t			size)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;	/* allocated here, handed to the helper */
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
+		return error;	/* allocation failed; helper is not called */
+
+	return __xfs_setfilesize(ip, tp, offset, size);
+}
+
 STATIC int
 xfs_setfilesize_ioend(
 	struct xfs_ioend	*ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
 		return error;
 	}
 
-	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
 /*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	uintptr_t		flags = (uintptr_t)private;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
 	if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
 	} else if (flags & XFS_DIO_FLAG_APPEND) {
-		struct xfs_trans *tp;
-
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-				&tp);
-		if (!error)
-			error = xfs_setfilesize(ip, tp, offset, size);
+		error = xfs_setfilesize(ip, offset, size);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..1950e3bca2ac 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,6 +62,7 @@ int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
+int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4ece4f2ffc72..e827d657c314 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -182,7 +182,7 @@ xfs_bmap_rtalloc(
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 
 		/* Zero the extent if we were asked to do so */
-		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+		if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
 			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
 			if (error)
 				return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e455f9098d49..2975cb2319f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
 	 */
 	if (bit) {
 		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
-		mask = ((1 << (end_bit - bit)) - 1) << bit;
+		mask = ((1U << (end_bit - bit)) - 1) << bit;
 		*wordp |= mask;
 		wordp++;
 		bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
 	 */
 	end_bit = bits_to_set - bits_set;
 	if (end_bit) {
-		mask = (1 << end_bit) - 1;
+		mask = (1U << end_bit) - 1;
 		*wordp |= mask;
 	}
 }
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
 	     bp->b_last_error != bp->b_error) {
 		bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
 		bp->b_last_error = bp->b_error;
-		if (cfg->retry_timeout && !bp->b_first_retry_time)
+		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+		    !bp->b_first_retry_time)
 			bp->b_first_retry_time = jiffies;
 
 		xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
 	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
 	    ++bp->b_retries > cfg->max_retries)
 			goto permanent_error;
-	if (cfg->retry_timeout &&
+	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
 	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
 			goto permanent_error;
 
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index c263e079273e..162dc186cf04 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -384,7 +384,7 @@ restart:
 		 * If this is a metadata allocation, try to reuse the busy
 		 * extent instead of trimming the allocation.
 		 */
-		if (!args->userdata &&
+		if (!xfs_alloc_is_userdata(args->datatype) &&
 		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
 			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
 							  busyp, fbno, flen,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e612a0233710..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -269,6 +269,8 @@ xfs_file_dio_aio_read(
 		return -EINVAL;
 	}
 
+	file_accessed(iocb->ki_filp);
+
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
 	 * IO, we effectively serialise all new concurrent read IO to this file
@@ -323,7 +325,6 @@ xfs_file_dio_aio_read(
 	}
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	file_accessed(iocb->ki_filp);
 	return ret;
 }
 
@@ -332,10 +333,7 @@ xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct iov_iter		data = *to;
+	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
 	size_t			count = iov_iter_count(to);
 	ssize_t			ret = 0;
 
@@ -345,11 +343,7 @@ xfs_file_dax_read(
 		return 0; /* skip atime */
 
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
+	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
@@ -711,70 +705,32 @@ xfs_file_dax_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	struct iov_iter		data;
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret, error = 0;
+	size_t			count;
+	loff_t			pos;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-		unaligned_io = 1;
-		iolock = XFS_IOLOCK_EXCL;
-	} else if (mapping->nrpages) {
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
 	xfs_rw_ilock(ip, iolock);
-
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. dax_do_io() should really do this invalidation internally as
-	 * it will know if we've allocated over a holei for this specific IO and
-	 * if so it needs to update the mapping tree and invalidate existing
-	 * PTEs over the newly allocated range. Remove this invalidation when
-	 * dax_do_io() is fixed up.
-	 */
-	if (mapping->nrpages) {
-		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
 
-		ret = invalidate_inode_pages2_range(mapping,
-						    iocb->ki_pos >> PAGE_SHIFT,
-						    end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
+	trace_xfs_file_dax_write(ip, count, pos);
 
-	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
+	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		error = xfs_setfilesize(ip, pos, ret);
 	}
 
-	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-	data = *from;
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-			xfs_end_io_direct_write, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
 out:
 	xfs_rw_iunlock(ip, iolock);
-	return ret;
+	return error ? error : ret;
 }
 
 STATIC ssize_t
@@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1503,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 4a33a3304369..043ca3808ea2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -30,6 +30,7 @@
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
+#include "xfs_ag_resv.h"
 
 struct xfs_fstrm_item {
 	struct xfs_mru_cache_elem	mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
 		}
 
 		longest = xfs_alloc_longest_free_extent(mp, pag,
-					xfs_alloc_min_freelist(mp, pag));
+				xfs_alloc_min_freelist(mp, pag),
+				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
 		if (((minlen && longest >= minlen) ||
 		     (!minlen && pag->pagf_freeblks >= minfree)) &&
 		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
@@ -369,7 +371,8 @@ xfs_filestream_new_ag(
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_extlen_t		minlen = ap->length;
 	xfs_agnumber_t		startag = 0;
-	int			flags, err = 0;
+	int			flags = 0;
+	int			err = 0;
 	struct xfs_mru_cache_elem *mru;
 
 	*agp = NULLAGNUMBER;
@@ -385,8 +388,10 @@ xfs_filestream_new_ag(
 		startag = (item->ag + 1) % mp->m_sb.sb_agcount;
 	}
 
-	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
-	        (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
+	if (xfs_alloc_is_userdata(ap->datatype))
+		flags |= XFS_PICK_USERDATA;
+	if (ap->dfops->dop_low)
+		flags |= XFS_PICK_LOWSPACE;
 
 	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0b7f986745c1..94ac06f3d908 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -553,7 +553,7 @@ xfs_growfs_data_private(
 		error = xfs_free_extent(tp,
 				XFS_AGB_TO_FSB(mp, agno,
 					be32_to_cpu(agf->agf_length) - new),
-				new, &oinfo);
+				new, &oinfo, XFS_AG_RESV_NONE);
 		if (error)
 			goto error0;
 	}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fb39a66914dd..65b2e3f85f52 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag(
 	struct xfs_perag *pag;
 	int tagged;
 
+	/*
+	 * Don't bother locking the AG and looking up in the radix trees
+	 * if we already know that we have the tag set.
+	 */
+	if (ip->i_flags & XFS_IEOFBLOCKS)
+		return;
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags |= XFS_IEOFBLOCKS;
+	spin_unlock(&ip->i_flags_lock);
+
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
 	trace_xfs_inode_set_eofblocks_tag(ip);
@@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag(
 	struct xfs_mount *mp = ip->i_mount;
 	struct xfs_perag *pag;
 
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags &= ~XFS_IEOFBLOCKS;
+	spin_unlock(&ip->i_flags_lock);
+
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
 	trace_xfs_inode_clear_eofblocks_tag(ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e1a411e08f00..8f30d2533b48 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 #define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 #define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
+#define XFS_IEOFBLOCKS		(1 << 10)/* has the preallocblocks tag set */
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2af0dda1c978..c08253e11545 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -42,17 +43,40 @@
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
-#define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
 
-STATIC int
-xfs_iomap_eof_align_last_fsb(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	xfs_extlen_t	extsize,
-	xfs_fileoff_t	*last_fsb)
+void
+xfs_bmbt_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (imap->br_startblock == HOLESTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_DELALLOC;
+	} else {
+		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static xfs_extlen_t
+xfs_eof_alignment(
+	struct xfs_inode	*ip,
+	xfs_extlen_t		extsize)
 {
-	xfs_extlen_t	align = 0;
-	int		eof, error;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_extlen_t		align = 0;
 
 	if (!XFS_IS_REALTIME_INODE(ip)) {
 		/*
@@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb(
 			align = extsize;
 	}
 
+	return align;
+}
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+	struct xfs_inode	*ip,
+	xfs_extlen_t		extsize,
+	xfs_fileoff_t		*last_fsb)
+{
+	xfs_extlen_t		align = xfs_eof_alignment(ip, extsize);
+
 	if (align) {
 		xfs_fileoff_t	new_last_fsb = roundup_64(*last_fsb, align);
+		int		eof, error;
+
 		error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
 		if (error)
 			return error;
@@ -154,7 +191,7 @@ xfs_iomap_write_direct(
 		 */
 		ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
 								XFS_IFEXTENTS);
-		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+		error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
 		if (error)
 			goto out_unlock;
 	} else {
@@ -274,130 +311,6 @@ out_trans_cancel:
 	goto out_unlock;
 }
 
-/*
- * If the caller is doing a write at the end of the file, then extend the
- * allocation out to the file system's write iosize.  We clean up any extra
- * space left over when the file is closed in xfs_inactive().
- *
- * If we find we already have delalloc preallocation beyond EOF, don't do more
- * preallocation as it it not needed.
- */
-STATIC int
-xfs_iomap_eof_want_preallocate(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	size_t		count,
-	xfs_bmbt_irec_t *imap,
-	int		nimaps,
-	int		*prealloc)
-{
-	xfs_fileoff_t   start_fsb;
-	xfs_filblks_t   count_fsb;
-	int		n, error, imaps;
-	int		found_delalloc = 0;
-
-	*prealloc = 0;
-	if (offset + count <= XFS_ISIZE(ip))
-		return 0;
-
-	/*
-	 * If the file is smaller than the minimum prealloc and we are using
-	 * dynamic preallocation, don't do any preallocation at all as it is
-	 * likely this is the only write to the file that is going to be done.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
-	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
-		return 0;
-
-	/*
-	 * If there are any real blocks past eof, then don't
-	 * do any speculative allocation.
-	 */
-	start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
-	count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-	while (count_fsb > 0) {
-		imaps = nimaps;
-		error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
-				       0);
-		if (error)
-			return error;
-		for (n = 0; n < imaps; n++) {
-			if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
-			    (imap[n].br_startblock != DELAYSTARTBLOCK))
-				return 0;
-			start_fsb += imap[n].br_blockcount;
-			count_fsb -= imap[n].br_blockcount;
-
-			if (imap[n].br_startblock == DELAYSTARTBLOCK)
-				found_delalloc = 1;
-		}
-	}
-	if (!found_delalloc)
-		*prealloc = 1;
-	return 0;
-}
-
-/*
- * Determine the initial size of the preallocation. We are beyond the current
- * EOF here, but we need to take into account whether this is a sparse write or
- * an extending write when determining the preallocation size.  Hence we need to
- * look up the extent that ends at the current write offset and use the result
- * to determine the preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceeding data extent as the basis for the
- * preallocation size. If the size of the extent is greater than half the
- * maximum extent length, then use the current offset as the basis. This ensures
- * that for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width alignment of
- * real extents.
- */
-STATIC xfs_fsblock_t
-xfs_iomap_eof_prealloc_initial_size(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	xfs_bmbt_irec_t		*imap,
-	int			nimaps)
-{
-	xfs_fileoff_t   start_fsb;
-	int		imaps = 1;
-	int		error;
-
-	ASSERT(nimaps >= imaps);
-
-	/* if we are using a specific prealloc size, return now */
-	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
-		return 0;
-
-	/* If the file is small, then use the minimum prealloc */
-	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
-		return 0;
-
-	/*
-	 * As we write multiple pages, the offset will always align to the
-	 * start of a page and hence point to a hole at EOF. i.e. if the size is
-	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
-	 * will return FSB 1. Hence if there are blocks in the file, we want to
-	 * point to the block prior to the EOF block and not the hole that maps
-	 * directly at @offset.
-	 */
-	start_fsb = XFS_B_TO_FSB(mp, offset);
-	if (start_fsb)
-		start_fsb--;
-	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
-	if (error)
-		return 0;
-
-	ASSERT(imaps == 1);
-	if (imap[0].br_startblock == HOLESTARTBLOCK)
-		return 0;
-	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
-		return imap[0].br_blockcount << 1;
-	return XFS_B_TO_FSB(mp, offset);
-}
-
 STATIC bool
 xfs_quota_need_throttle(
 	struct xfs_inode *ip,
@@ -459,27 +372,76 @@ xfs_quota_calc_throttle(
 }
 
 /*
+ * If we are doing a write at the end of the file and there are no allocations
+ * past this one, then extend the allocation out to the file system's write
+ * iosize.
+ *
  * If we don't have a user specified preallocation size, dynamically increase
- * the preallocation size as the size of the file grows. Cap the maximum size
+ * the preallocation size as the size of the file grows.  Cap the maximum size
  * at a single extent or less if the filesystem is near full. The closer the
  * filesystem is to full, the smaller the maximum prealocation.
+ *
+ * As an exception we don't do any preallocation at all if the file is smaller
+ * than the minimum preallocation and we are using the default dynamic
+ * preallocation scheme, as it is likely this is the only write to the file that
+ * is going to be done.
+ *
+ * We clean up any extra space left over when the file is closed in
+ * xfs_inactive().
  */
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
-	struct xfs_mount	*mp,
 	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	struct xfs_bmbt_irec	*imap,
-	int			nimaps)
+	loff_t			offset,
+	loff_t			count,
+	xfs_extnum_t		idx,
+	struct xfs_bmbt_irec	*prev)
 {
-	xfs_fsblock_t		alloc_blocks = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	int			shift = 0;
 	int64_t			freesp;
 	xfs_fsblock_t		qblocks;
 	int			qshift = 0;
+	xfs_fsblock_t		alloc_blocks = 0;
+
+	if (offset + count <= XFS_ISIZE(ip))
+		return 0;
 
-	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
-							   imap, nimaps);
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+	    (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+		return 0;
+
+	/*
+	 * If an explicit allocsize is set, the file is small, or we
+	 * are writing behind a hole, then use the minimum prealloc:
+	 */
+	if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+	    idx == 0 ||
+	    prev->br_startoff + prev->br_blockcount < offset_fsb)
+		return mp->m_writeio_blocks;
+
+	/*
+	 * Determine the initial size of the preallocation. We are beyond the
+	 * current EOF here, but we need to take into account whether this is
+	 * a sparse write or an extending write when determining the
+	 * preallocation size.  Hence we need to look up the extent that ends
+	 * at the current write offset and use the result to determine the
+	 * preallocation size.
+	 *
+	 * If the extent is a hole, then preallocation is essentially disabled.
+	 * Otherwise we take the size of the preceding data extent as the basis
+	 * for the preallocation size. If the size of the extent is greater than
+	 * half the maximum extent length, then use the current offset as the
+	 * basis. This ensures that for large files the preallocation size
+	 * always extends to MAXEXTLEN rather than falling short due to things
+	 * like stripe unit/width alignment of real extents.
+	 */
+	if (prev->br_blockcount <= (MAXEXTLEN >> 1))
+		alloc_blocks = prev->br_blockcount << 1;
+	else
+		alloc_blocks = XFS_B_TO_FSB(mp, offset);
 	if (!alloc_blocks)
 		goto check_writeio;
 	qblocks = alloc_blocks;
@@ -550,120 +512,145 @@ xfs_iomap_prealloc_size(
 	 */
 	while (alloc_blocks && alloc_blocks >= freesp)
 		alloc_blocks >>= 4;
-
 check_writeio:
 	if (alloc_blocks < mp->m_writeio_blocks)
 		alloc_blocks = mp->m_writeio_blocks;
-
 	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
 				      mp->m_writeio_blocks);
-
 	return alloc_blocks;
 }
 
-int
-xfs_iomap_write_delay(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	size_t		count,
-	xfs_bmbt_irec_t *ret_imap)
+static int
+xfs_file_iomap_begin_delay(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			count,
+	unsigned		flags,
+	struct iomap		*iomap)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_fileoff_t	offset_fsb;
-	xfs_fileoff_t	last_fsb;
-	xfs_off_t	aligned_offset;
-	xfs_fileoff_t	ioalign;
-	xfs_extlen_t	extsz;
-	int		nimaps;
-	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc;
-	int		error;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	/*
-	 * Make sure that the dquots are there. This doesn't hold
-	 * the ilock across a disk read.
-	 */
-	error = xfs_qm_dqattach_locked(ip, 0);
-	if (error)
-		return error;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		maxbytes_fsb =
+		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	xfs_fileoff_t		end_fsb, orig_end_fsb;
+	int			error = 0, eof = 0;
+	struct xfs_bmbt_irec	got;
+	struct xfs_bmbt_irec	prev;
+	xfs_extnum_t		idx;
 
-	extsz = xfs_get_extsz_hint(ip);
-	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	ASSERT(!XFS_IS_REALTIME_INODE(ip));
+	ASSERT(!xfs_get_extsz_hint(ip));
 
-	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-				imap, XFS_WRITE_IMAPS, &prealloc);
-	if (error)
-		return error;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-retry:
-	if (prealloc) {
-		xfs_fsblock_t	alloc_blocks;
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		error = -EFSCORRUPTED;
+		goto out_unlock;
+	}
 
-		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
-						       XFS_WRITE_IMAPS);
+	XFS_STATS_INC(mp, xs_blk_mapw);
 
-		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
-		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-		last_fsb = ioalign + alloc_blocks;
-	} else {
-		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_unlock;
 	}
 
-	if (prealloc || extsz) {
-		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
-		if (error)
-			return error;
+	xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
+			&got, &prev);
+	if (!eof && got.br_startoff <= offset_fsb) {
+		trace_xfs_iomap_found(ip, offset, count, 0, &got);
+		goto done;
 	}
 
+	error = xfs_qm_dqattach_locked(ip, 0);
+	if (error)
+		goto out_unlock;
+
 	/*
-	 * Make sure preallocation does not create extents beyond the range we
-	 * actually support in this filesystem.
+	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+	 * to keep the chunks of work done here somewhat symmetric with the
+	 * work writeback does. This is a completely arbitrary number pulled
+	 * out of thin air as a best guess for initial testing.
+	 *
+	 * Note that the value needs to be less than 32-bits wide until
+	 * the lower level functions are updated.
 	 */
-	if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
-		last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+	end_fsb = orig_end_fsb =
+		min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+	if (eof) {
+		xfs_fsblock_t	prealloc_blocks;
 
-	ASSERT(last_fsb > offset_fsb);
+		prealloc_blocks =
+			xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+		if (prealloc_blocks) {
+			xfs_extlen_t	align;
+			xfs_off_t	end_offset;
 
-	nimaps = XFS_WRITE_IMAPS;
-	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
-				imap, &nimaps, XFS_BMAPI_ENTIRE);
+			end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
+			end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+				prealloc_blocks;
+
+			align = xfs_eof_alignment(ip, 0);
+			if (align)
+				end_fsb = roundup_64(end_fsb, align);
+
+			end_fsb = min(end_fsb, maxbytes_fsb);
+			ASSERT(end_fsb > offset_fsb);
+		}
+	}
+
+retry:
+	error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
+			end_fsb - offset_fsb, &got,
+			&prev, &idx, eof);
 	switch (error) {
 	case 0:
+		break;
 	case -ENOSPC:
 	case -EDQUOT:
-		break;
-	default:
-		return error;
-	}
-
-	/*
-	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
-	 * without EOF preallocation.
-	 */
-	if (nimaps == 0) {
+		/* retry without any preallocation */
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (prealloc) {
-			prealloc = 0;
-			error = 0;
+		if (end_fsb != orig_end_fsb) {
+			end_fsb = orig_end_fsb;
 			goto retry;
 		}
-		return error ? error : -ENOSPC;
+		/*FALLTHRU*/
+	default:
+		goto out_unlock;
 	}
 
-	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
-		return xfs_alert_fsblock_zero(ip, &imap[0]);
-
 	/*
 	 * Tag the inode as speculatively preallocated so we can reclaim this
 	 * space on demand, if necessary.
 	 */
-	if (prealloc)
+	if (end_fsb != orig_end_fsb)
 		xfs_inode_set_eofblocks_tag(ip);
 
-	*ret_imap = imap[0];
-	return 0;
+	trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+done:
+	if (isnullstartblock(got.br_startblock))
+		got.br_startblock = DELAYSTARTBLOCK;
+
+	if (!got.br_startblock) {
+		error = xfs_alert_fsblock_zero(ip, &got);
+		if (error)
+			goto out_unlock;
+	}
+
+	xfs_bmbt_to_iomap(ip, iomap, &got);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
 }
 
 /*
@@ -947,37 +934,13 @@ error_on_bmapi_transaction:
 	return error;
 }
 
-void
-xfs_bmbt_to_iomap(
-	struct xfs_inode	*ip,
-	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	if (imap->br_startblock == HOLESTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_HOLE;
-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_DELALLOC;
-	} else {
-		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
-		if (imap->br_state == XFS_EXT_UNWRITTEN)
-			iomap->type = IOMAP_UNWRITTEN;
-		else
-			iomap->type = IOMAP_MAPPED;
-	}
-	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-}
-
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+		struct xfs_bmbt_irec *imap, int nimaps)
 {
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
-		imap->br_startblock == DELAYSTARTBLOCK;
+		imap->br_startblock == DELAYSTARTBLOCK ||
+		(IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 
 static int
@@ -993,11 +956,18 @@ xfs_file_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			nimaps = 1, error = 0;
+	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if ((flags & IOMAP_WRITE) &&
+	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+				iomap);
+	}
+
+	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -1008,11 +978,11 @@ xfs_file_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ENTIRE);
 	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return error;
 	}
 
-	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
@@ -1024,27 +994,23 @@ xfs_file_iomap_begin(
 		 * the lower level functions are updated.
 		 */
 		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
-		if (xfs_get_extsz_hint(ip)) {
-			/*
-			 * xfs_iomap_write_direct() expects the shared lock. It
-			 * is unlocked on return.
-			 */
-			xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
-			error = xfs_iomap_write_direct(ip, offset, length, &imap,
-					nimaps);
-		} else {
-			error = xfs_iomap_write_delay(ip, offset, length, &imap);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-
+		/*
+		 * xfs_iomap_write_direct() expects the shared lock. It
+		 * is unlocked on return.
+		 */
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);
+		error = xfs_iomap_write_direct(ip, offset, length, &imap,
+				nimaps);
 		if (error)
 			return error;
 
+		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
 		ASSERT(nimaps);
 
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fb8aca3d69ab..6498be485932 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,8 +25,6 @@ struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
-			struct xfs_bmbt_irec *);
 int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 765f084759b5..2b6eec52178e 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -413,7 +413,8 @@ struct xlog {
 	/* log record crc error injection factor */
 	uint32_t		l_badcrc_factor;
 #endif
-
+	/* log recovery lsn tracking (for buffer submission) */
+	xfs_lsn_t		l_recovery_lsn;
 };
 
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e8638fd2c0c3..846483d56949 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_dir2.h"
 #include "xfs_rmap_item.h"
+#include "xfs_buf_item.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -381,6 +382,15 @@ xlog_recover_iodone(
 						SHUTDOWN_META_IO_ERROR);
 		}
 	}
+
+	/*
+	 * On v5 supers, a bli could be attached to update the metadata LSN.
+	 * Clean it up.
+	 */
+	if (bp->b_fspriv)
+		xfs_buf_item_relse(bp);
+	ASSERT(bp->b_fspriv == NULL);
+
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp);
 }
@@ -2360,12 +2370,14 @@ static void
 xlog_recover_validate_buf_type(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp,
-	xfs_buf_log_format_t	*buf_f)
+	xfs_buf_log_format_t	*buf_f,
+	xfs_lsn_t		current_lsn)
 {
 	struct xfs_da_blkinfo	*info = bp->b_addr;
 	__uint32_t		magic32;
 	__uint16_t		magic16;
 	__uint16_t		magicda;
+	char			*warnmsg = NULL;
 
 	/*
 	 * We can only do post recovery validation on items on CRC enabled
@@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type(
 			bp->b_ops = &xfs_rmapbt_buf_ops;
 			break;
 		default:
-			xfs_warn(mp, "Bad btree block magic!");
-			ASSERT(0);
+			warnmsg = "Bad btree block magic!";
 			break;
 		}
 		break;
 	case XFS_BLFT_AGF_BUF:
 		if (magic32 != XFS_AGF_MAGIC) {
-			xfs_warn(mp, "Bad AGF block magic!");
-			ASSERT(0);
+			warnmsg = "Bad AGF block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_agf_buf_ops;
 		break;
 	case XFS_BLFT_AGFL_BUF:
 		if (magic32 != XFS_AGFL_MAGIC) {
-			xfs_warn(mp, "Bad AGFL block magic!");
-			ASSERT(0);
+			warnmsg = "Bad AGFL block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_agfl_buf_ops;
 		break;
 	case XFS_BLFT_AGI_BUF:
 		if (magic32 != XFS_AGI_MAGIC) {
-			xfs_warn(mp, "Bad AGI block magic!");
-			ASSERT(0);
+			warnmsg = "Bad AGI block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_agi_buf_ops;
@@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_GDQUOT_BUF:
 #ifdef CONFIG_XFS_QUOTA
 		if (magic16 != XFS_DQUOT_MAGIC) {
-			xfs_warn(mp, "Bad DQUOT block magic!");
-			ASSERT(0);
+			warnmsg = "Bad DQUOT block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dquot_buf_ops;
@@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type(
 		break;
 	case XFS_BLFT_DINO_BUF:
 		if (magic16 != XFS_DINODE_MAGIC) {
-			xfs_warn(mp, "Bad INODE block magic!");
-			ASSERT(0);
+			warnmsg = "Bad INODE block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_inode_buf_ops;
 		break;
 	case XFS_BLFT_SYMLINK_BUF:
 		if (magic32 != XFS_SYMLINK_MAGIC) {
-			xfs_warn(mp, "Bad symlink block magic!");
-			ASSERT(0);
+			warnmsg = "Bad symlink block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_symlink_buf_ops;
@@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DIR_BLOCK_BUF:
 		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
 		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
-			xfs_warn(mp, "Bad dir block magic!");
-			ASSERT(0);
+			warnmsg = "Bad dir block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dir3_block_buf_ops;
@@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DIR_DATA_BUF:
 		if (magic32 != XFS_DIR2_DATA_MAGIC &&
 		    magic32 != XFS_DIR3_DATA_MAGIC) {
-			xfs_warn(mp, "Bad dir data magic!");
-			ASSERT(0);
+			warnmsg = "Bad dir data magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DIR_FREE_BUF:
 		if (magic32 != XFS_DIR2_FREE_MAGIC &&
 		    magic32 != XFS_DIR3_FREE_MAGIC) {
-			xfs_warn(mp, "Bad dir3 free magic!");
-			ASSERT(0);
+			warnmsg = "Bad dir3 free magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dir3_free_buf_ops;
@@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DIR_LEAF1_BUF:
 		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
 		    magicda != XFS_DIR3_LEAF1_MAGIC) {
-			xfs_warn(mp, "Bad dir leaf1 magic!");
-			ASSERT(0);
+			warnmsg = "Bad dir leaf1 magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
@@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DIR_LEAFN_BUF:
 		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
 		    magicda != XFS_DIR3_LEAFN_MAGIC) {
-			xfs_warn(mp, "Bad dir leafn magic!");
-			ASSERT(0);
+			warnmsg = "Bad dir leafn magic!";
 			break;
 		}
 		bp->b_ops = &xfs_dir3_leafn_buf_ops;
@@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_DA_NODE_BUF:
 		if (magicda != XFS_DA_NODE_MAGIC &&
 		    magicda != XFS_DA3_NODE_MAGIC) {
-			xfs_warn(mp, "Bad da node magic!");
-			ASSERT(0);
+			warnmsg = "Bad da node magic!";
 			break;
 		}
 		bp->b_ops = &xfs_da3_node_buf_ops;
@@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type(
 	case XFS_BLFT_ATTR_LEAF_BUF:
 		if (magicda != XFS_ATTR_LEAF_MAGIC &&
 		    magicda != XFS_ATTR3_LEAF_MAGIC) {
-			xfs_warn(mp, "Bad attr leaf magic!");
-			ASSERT(0);
+			warnmsg = "Bad attr leaf magic!";
 			break;
 		}
 		bp->b_ops = &xfs_attr3_leaf_buf_ops;
 		break;
 	case XFS_BLFT_ATTR_RMT_BUF:
 		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
-			xfs_warn(mp, "Bad attr remote magic!");
-			ASSERT(0);
+			warnmsg = "Bad attr remote magic!";
 			break;
 		}
 		bp->b_ops = &xfs_attr3_rmt_buf_ops;
 		break;
 	case XFS_BLFT_SB_BUF:
 		if (magic32 != XFS_SB_MAGIC) {
-			xfs_warn(mp, "Bad SB block magic!");
-			ASSERT(0);
+			warnmsg = "Bad SB block magic!";
 			break;
 		}
 		bp->b_ops = &xfs_sb_buf_ops;
@@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type(
 			 xfs_blft_from_flags(buf_f));
 		break;
 	}
+
+	/*
+	 * Nothing else to do in the case of a NULL current LSN as this means
+	 * the buffer is more recent than the change in the log and will be
+	 * skipped.
+	 */
+	if (current_lsn == NULLCOMMITLSN)
+		return;
+
+	if (warnmsg) {
+		xfs_warn(mp, warnmsg);
+		ASSERT(0);
+	}
+
+	/*
+	 * We must update the metadata LSN of the buffer as it is written out to
+	 * ensure that older transactions never replay over this one and corrupt
+	 * the buffer. This can occur if log recovery is interrupted at some
+	 * point after the current transaction completes, at which point a
+	 * subsequent mount starts recovery from the beginning.
+	 *
+	 * Write verifiers update the metadata LSN from log items attached to
+	 * the buffer. Therefore, initialize a bli purely to carry the LSN to
+	 * the verifier. We'll clean it up in our ->iodone() callback.
+	 */
+	if (bp->b_ops) {
+		struct xfs_buf_log_item	*bip;
+
+		ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
+		bp->b_iodone = xlog_recover_iodone;
+		xfs_buf_item_init(bp, mp);
+		bip = bp->b_fspriv;
+		bip->bli_item.li_lsn = current_lsn;
+	}
 }
 
 /*
@@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer(
 	struct xfs_mount	*mp,
 	xlog_recover_item_t	*item,
 	struct xfs_buf		*bp,
-	xfs_buf_log_format_t	*buf_f)
+	xfs_buf_log_format_t	*buf_f,
+	xfs_lsn_t		current_lsn)
 {
 	int			i;
 	int			bit;
@@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer(
 	/* Shouldn't be any more regions */
 	ASSERT(i == item->ri_total);
 
-	xlog_recover_validate_buf_type(mp, bp, buf_f);
+	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
 }
 
 /*
@@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return false;
 
-	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
 	return true;
 }
 
@@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2(
 	 */
 	lsn = xlog_recover_get_buf_lsn(mp, bp);
 	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
-		xlog_recover_validate_buf_type(mp, bp, buf_f);
+		trace_xfs_log_recover_buf_skip(log, buf_f);
+		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
 		goto out_release;
 	}
 
@@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2(
 		if (!dirty)
 			goto out_release;
 	} else {
-		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
 	}
 
 	/*
@@ -3846,14 +3878,13 @@ STATIC int
 xlog_recover_commit_trans(
 	struct xlog		*log,
 	struct xlog_recover	*trans,
-	int			pass)
+	int			pass,
+	struct list_head	*buffer_list)
 {
 	int				error = 0;
-	int				error2;
 	int				items_queued = 0;
 	struct xlog_recover_item	*item;
 	struct xlog_recover_item	*next;
-	LIST_HEAD			(buffer_list);
 	LIST_HEAD			(ra_list);
 	LIST_HEAD			(done_list);
 
@@ -3876,7 +3907,7 @@ xlog_recover_commit_trans(
 			items_queued++;
 			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
 				error = xlog_recover_items_pass2(log, trans,
-						&buffer_list, &ra_list);
+						buffer_list, &ra_list);
 				list_splice_tail_init(&ra_list, &done_list);
 				items_queued = 0;
 			}
@@ -3894,15 +3925,14 @@ out:
 	if (!list_empty(&ra_list)) {
 		if (!error)
 			error = xlog_recover_items_pass2(log, trans,
-					&buffer_list, &ra_list);
+					buffer_list, &ra_list);
 		list_splice_tail_init(&ra_list, &done_list);
 	}
 
 	if (!list_empty(&done_list))
 		list_splice_init(&done_list, &trans->r_itemq);
 
-	error2 = xfs_buf_delwri_submit(&buffer_list);
-	return error ? error : error2;
+	return error;
 }
 
 STATIC void
@@ -4085,7 +4115,8 @@ xlog_recovery_process_trans(
 	char			*dp,
 	unsigned int		len,
 	unsigned int		flags,
-	int			pass)
+	int			pass,
+	struct list_head	*buffer_list)
 {
 	int			error = 0;
 	bool			freeit = false;
@@ -4109,7 +4140,8 @@ xlog_recovery_process_trans(
 		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
 		break;
 	case XLOG_COMMIT_TRANS:
-		error = xlog_recover_commit_trans(log, trans, pass);
+		error = xlog_recover_commit_trans(log, trans, pass,
+						  buffer_list);
 		/* success or fail, we are now done with this transaction. */
 		freeit = true;
 		break;
@@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr(
 	struct xlog_op_header	*ohead,
 	char			*dp,
 	char			*end,
-	int			pass)
+	int			pass,
+	struct list_head	*buffer_list)
 {
 	struct xlog_recover	*trans;
 	unsigned int		len;
+	int			error;
 
 	/* Do we understand who wrote this op? */
 	if (ohead->oh_clientid != XFS_TRANSACTION &&
@@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr(
 		return 0;
 	}
 
+	/*
+	 * The recovered buffer queue is drained only once we know that all
+	 * recovery items for the current LSN have been processed. This is
+	 * required because:
+	 *
+	 * - Buffer write submission updates the metadata LSN of the buffer.
+	 * - Log recovery skips items with a metadata LSN >= the current LSN of
+	 *   the recovery item.
+	 * - Separate recovery items against the same metadata buffer can share
+	 *   a current LSN. I.e., consider that the LSN of a recovery item is
+	 *   defined as the starting LSN of the first record in which its
+	 *   transaction appears, that a record can hold multiple transactions,
+	 *   and/or that a transaction can span multiple records.
+	 *
+	 * In other words, we are allowed to submit a buffer from log recovery
+	 * once per current LSN. Otherwise, we may incorrectly skip recovery
+	 * items and cause corruption.
+	 *
+	 * We don't know up front whether buffers are updated multiple times per
+	 * LSN. Therefore, track the current LSN of each commit log record as it
+	 * is processed and drain the queue when it changes. Use commit records
+	 * because they are ordered correctly by the logging code.
+	 */
+	if (log->l_recovery_lsn != trans->r_lsn &&
+	    ohead->oh_flags & XLOG_COMMIT_TRANS) {
+		error = xfs_buf_delwri_submit(buffer_list);
+		if (error)
+			return error;
+		log->l_recovery_lsn = trans->r_lsn;
+	}
+
 	return xlog_recovery_process_trans(log, trans, dp, len,
-					   ohead->oh_flags, pass);
+					   ohead->oh_flags, pass, buffer_list);
 }
 
 /*
@@ -4240,7 +4305,8 @@ xlog_recover_process_data(
 	struct hlist_head	rhash[],
 	struct xlog_rec_header	*rhead,
 	char			*dp,
-	int			pass)
+	int			pass,
+	struct list_head	*buffer_list)
 {
 	struct xlog_op_header	*ohead;
 	char			*end;
@@ -4254,6 +4320,7 @@ xlog_recover_process_data(
 	if (xlog_header_check_recover(log->l_mp, rhead))
 		return -EIO;
 
+	trace_xfs_log_recover_record(log, rhead, pass);
 	while ((dp < end) && num_logops) {
 
 		ohead = (struct xlog_op_header *)dp;
@@ -4262,7 +4329,7 @@ xlog_recover_process_data(
 
 		/* errors will abort recovery */
 		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
-						    dp, end, pass);
+						   dp, end, pass, buffer_list);
 		if (error)
 			return error;
 
@@ -4685,7 +4752,8 @@ xlog_recover_process(
 	struct hlist_head	rhash[],
 	struct xlog_rec_header	*rhead,
 	char			*dp,
-	int			pass)
+	int			pass,
+	struct list_head	*buffer_list)
 {
 	int			error;
 	__le32			crc;
@@ -4732,7 +4800,8 @@ xlog_recover_process(
 	if (error)
 		return error;
 
-	return xlog_recover_process_data(log, rhash, rhead, dp, pass);
+	return xlog_recover_process_data(log, rhash, rhead, dp, pass,
+					 buffer_list);
 }
 
 STATIC int
@@ -4793,9 +4862,11 @@ xlog_do_recovery_pass(
 	char			*offset;
 	xfs_buf_t		*hbp, *dbp;
 	int			error = 0, h_size, h_len;
+	int			error2 = 0;
 	int			bblks, split_bblks;
 	int			hblks, split_hblks, wrapped_hblks;
 	struct hlist_head	rhash[XLOG_RHASH_SIZE];
+	LIST_HEAD		(buffer_list);
 
 	ASSERT(head_blk != tail_blk);
 	rhead_blk = 0;
@@ -4981,7 +5052,7 @@ xlog_do_recovery_pass(
 			}
 
 			error = xlog_recover_process(log, rhash, rhead, offset,
-						     pass);
+						     pass, &buffer_list);
 			if (error)
 				goto bread_err2;
 
@@ -5012,7 +5083,8 @@ xlog_do_recovery_pass(
 		if (error)
 			goto bread_err2;
 
-		error = xlog_recover_process(log, rhash, rhead, offset, pass);
+		error = xlog_recover_process(log, rhash, rhead, offset, pass,
+					     &buffer_list);
 		if (error)
 			goto bread_err2;
 
@@ -5025,10 +5097,17 @@ xlog_do_recovery_pass(
  bread_err1:
 	xlog_put_bp(hbp);
 
+	/*
+	 * Submit buffers that have been added from the last record processed,
+	 * regardless of error status.
+	 */
+	if (!list_empty(&buffer_list))
+		error2 = xfs_buf_delwri_submit(&buffer_list);
+
 	if (error && first_bad)
 		*first_bad = rhead_blk;
 
-	return error;
+	return error ? error : error2;
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index faeead671f9f..56e85a6c85c7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -934,6 +934,20 @@ xfs_mountfs(
 	}
 
 	/*
+	 * Now the log is fully replayed, we can transition to full read-only
+	 * mode for read-only mounts. This will sync all the metadata and clean
+	 * the log so that the recovery we just performed does not have to be
+	 * replayed again on the next mount.
+	 *
+	 * We use the same quiesce mechanism as the rw->ro remount, as they are
+	 * semantically identical operations.
+	 */
+	if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
+							XFS_MOUNT_RDONLY) {
+		xfs_quiesce_attr(mp);
+	}
+
+	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
 	if (quotamount) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b36676cde103..041d9493e798 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,10 +57,16 @@ enum {
 
 #define XFS_ERR_RETRY_FOREVER	-1
 
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day.  So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855.  Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
 struct xfs_error_cfg {
 	struct xfs_kobj	kobj;
 	int		max_retries;
-	unsigned long	retry_timeout;	/* in jiffies, 0 = no timeout */
+	long		retry_timeout;	/* in jiffies, -1 = infinite */
 };
 
 typedef struct xfs_mount {
@@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
 }
 #endif
 
+/* per-AG block reservation data structures */
+enum xfs_ag_resv_type {
+	XFS_AG_RESV_NONE = 0,
+	XFS_AG_RESV_METADATA,
+	XFS_AG_RESV_AGFL,
+};
+
+struct xfs_ag_resv {
+	/* number of blocks originally reserved here */
+	xfs_extlen_t			ar_orig_reserved;
+	/* number of blocks reserved here */
+	xfs_extlen_t			ar_reserved;
+	/* number of blocks originally asked for */
+	xfs_extlen_t			ar_asked;
+};
+
 /*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
@@ -372,8 +394,28 @@ typedef struct xfs_perag {
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 	int		pagb_count;	/* pagb slots in use */
+
+	/* Blocks reserved for all kinds of metadata. */
+	struct xfs_ag_resv	pag_meta_resv;
+	/* Blocks reserved for just AGFL-based metadata. */
+	struct xfs_ag_resv	pag_agfl_resv;
 } xfs_perag_t;
 
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+	struct xfs_perag	*pag,
+	enum xfs_ag_resv_type	type)
+{
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+		return &pag->pag_meta_resv;
+	case XFS_AG_RESV_AGFL:
+		return &pag->pag_agfl_resv;
+	default:
+		return NULL;
+	}
+}
+
 extern void	xfs_uuid_table_free(void);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2500f28689d5..0432a459871c 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -51,28 +51,16 @@ xfs_rui_item_free(
 		kmem_zone_free(xfs_rui_zone, ruip);
 }
 
-/*
- * This returns the number of iovecs needed to log the given rui item.
- * We only need 1 iovec for an rui item.  It just logs the rui_log_format
- * structure.
- */
-static inline int
-xfs_rui_item_sizeof(
-	struct xfs_rui_log_item *ruip)
-{
-	return sizeof(struct xfs_rui_log_format) +
-			(ruip->rui_format.rui_nextents - 1) *
-			sizeof(struct xfs_map_extent);
-}
-
 STATIC void
 xfs_rui_item_size(
 	struct xfs_log_item	*lip,
 	int			*nvecs,
 	int			*nbytes)
 {
+	struct xfs_rui_log_item	*ruip = RUI_ITEM(lip);
+
 	*nvecs += 1;
-	*nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
 }
 
 /*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
 	ruip->rui_format.rui_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
-			xfs_rui_item_sizeof(ruip));
+			xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
 }
 
 /*
@@ -205,16 +193,12 @@ xfs_rui_init(
 
 {
 	struct xfs_rui_log_item		*ruip;
-	uint				size;
 
 	ASSERT(nextents > 0);
-	if (nextents > XFS_RUI_MAX_FAST_EXTENTS) {
-		size = (uint)(sizeof(struct xfs_rui_log_item) +
-			((nextents - 1) * sizeof(struct xfs_map_extent)));
-		ruip = kmem_zalloc(size, KM_SLEEP);
-	} else {
+	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+	else
 		ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
-	}
 
 	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
 	uint				len;
 
 	src_rui_fmt = buf->i_addr;
-	len = sizeof(struct xfs_rui_log_format) +
-			(src_rui_fmt->rui_nextents - 1) *
-			sizeof(struct xfs_map_extent);
+	len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
 
 	if (buf->i_len != len)
 		return -EFSCORRUPTED;
 
-	memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len);
+	memcpy(dst_rui_fmt, src_rui_fmt, len);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index aefcc3a318a5..340c968e1f9c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
 	struct xfs_rui_log_format	rui_format;
 };
 
+static inline size_t
+xfs_rui_log_item_sizeof(
+	unsigned int		nr)
+{
+	return offsetof(struct xfs_rui_log_item, rui_format) +
+			xfs_rui_log_format_sizeof(nr);
+}
+
 /*
  * This is the "rmap update done" log item.  It is used to log the fact that
  * some rmapbt updates mentioned in an earlier rui item have been performed.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fd6be45b3a1e..2d092f9577ca 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  * Note: xfs_log_quiesce() stops background log work - the callers must ensure
  * it is started again when appropriate.
  */
-static void
+void
 xfs_quiesce_attr(
 	struct xfs_mount	*mp)
 {
@@ -1782,9 +1782,8 @@ xfs_init_zones(void)
 	if (!xfs_rud_zone)
 		goto out_destroy_icreate_zone;
 
-	xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
-			((XFS_RUI_MAX_FAST_EXTENTS - 1) *
-				sizeof(struct xfs_map_extent))),
+	xfs_rui_zone = kmem_zone_init(
+			xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
 			"xfs_rui_item");
 	if (!xfs_rui_zone)
 		goto out_destroy_rud_zone;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 529bce9fc37e..b6418abd85ad 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,6 +61,7 @@ struct xfs_mount;
 struct xfs_buftarg;
 struct block_device;
 
+extern void xfs_quiesce_attr(struct xfs_mount *mp);
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 79cfd3fc5324..5f8d55d29a11 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -393,9 +393,15 @@ max_retries_show(
 	struct kobject	*kobject,
 	char		*buf)
 {
+	int		retries;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+	if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
+		retries = -1;
+	else
+		retries = cfg->max_retries;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", retries);
 }
 
 static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
 	if (val < -1)
 		return -EINVAL;
 
-	cfg->max_retries = val;
+	if (val == -1)
+		cfg->max_retries = XFS_ERR_RETRY_FOREVER;
+	else
+		cfg->max_retries = val;
 	return count;
 }
 XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
 	struct kobject	*kobject,
 	char		*buf)
 {
+	int		timeout;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%ld\n",
-			jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+		timeout = -1;
+	else
+		timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
 }
 
 static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
 	if (ret)
 		return ret;
 
-	/* 1 day timeout maximum */
-	if (val < 0 || val > 86400)
+	/* 1 day timeout maximum, -1 means infinite */
+	if (val < -1 || val > 86400)
 		return -EINVAL;
 
-	cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+	if (val == -1)
+		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+	else {
+		cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+		ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+	}
 	return count;
 }
 XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -519,18 +538,19 @@ struct xfs_error_init {
 static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
 	{ .name = "default",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "EIO",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "ENOSPC",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "ENODEV",
-	  .max_retries = 0,
+	  .max_retries = 0,	/* We can't recover from devices disappearing */
+	  .retry_timeout = 0,
 	},
 };
 
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
 			goto out_error;
 
 		cfg->max_retries = init[i].max_retries;
-		cfg->retry_timeout = msecs_to_jiffies(
+		if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+			cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+		else
+			cfg->retry_timeout = msecs_to_jiffies(
 					init[i].retry_timeout * MSEC_PER_SEC);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d303a665dba9..c6b2b1dcde75 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf,
 
 TRACE_EVENT(xfs_free_extent,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
-	TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+		 xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+		 int haveright),
+	TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, isfl)
+		__field(int, resv)
 		__field(int, haveleft)
 		__field(int, haveright)
 	),
@@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent,
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->isfl = isfl;
+		__entry->resv = resv;
 		__entry->haveleft = haveleft;
 		__entry->haveright = haveright;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->isfl,
+		  __entry->resv,
 		  __entry->haveleft ?
 			(__entry->haveright ? "both" : "left") :
 			(__entry->haveright ? "right" : "none"))
@@ -1622,8 +1623,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		__field(short, otype)
 		__field(char, wasdel)
 		__field(char, wasfromfl)
-		__field(char, isfl)
-		__field(char, userdata)
+		__field(int, resv)
+		__field(int, datatype)
 		__field(xfs_fsblock_t, firstblock)
 	),
 	TP_fast_assign(
@@ -1643,14 +1644,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		__entry->otype = args->otype;
 		__entry->wasdel = args->wasdel;
 		__entry->wasfromfl = args->wasfromfl;
-		__entry->isfl = args->isfl;
-		__entry->userdata = args->userdata;
+		__entry->resv = args->resv;
+		__entry->datatype = args->datatype;
 		__entry->firstblock = args->firstblock;
 	),
 	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
 		  "prod %u minleft %u total %u alignment %u minalignslop %u "
-		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
-		  "userdata %d firstblock 0x%llx",
+		  "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
+		  "datatype 0x%x firstblock 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
@@ -1667,8 +1668,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
 		  __entry->wasdel,
 		  __entry->wasfromfl,
-		  __entry->isfl,
-		  __entry->userdata,
+		  __entry->resv,
+		  __entry->datatype,
 		  (unsigned long long)__entry->firstblock)
 )
 
@@ -1984,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
+TRACE_EVENT(xfs_log_recover_record,
+	TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+	TP_ARGS(log, rhead, pass),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_lsn_t, lsn)
+		__field(int, len)
+		__field(int, num_logops)
+		__field(int, pass)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->lsn = be64_to_cpu(rhead->h_lsn);
+		__entry->len = be32_to_cpu(rhead->h_len);
+		__entry->num_logops = be32_to_cpu(rhead->h_num_logops);
+		__entry->pass = pass;
+	),
+	TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lsn, __entry->len, __entry->num_logops,
+		   __entry->pass)
+)
+
 DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
 	TP_PROTO(struct xlog *log, struct xlog_recover *trans,
 		struct xlog_recover_item *item, int pass),
@@ -1992,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
 		__field(dev_t, dev)
 		__field(unsigned long, item)
 		__field(xlog_tid_t, tid)
+		__field(xfs_lsn_t, lsn)
 		__field(int, type)
 		__field(int, pass)
 		__field(int, count)
@@ -2001,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
 		__entry->dev = log->l_mp->m_super->s_dev;
 		__entry->item = (unsigned long)item;
 		__entry->tid = trans->r_log_tid;
+		__entry->lsn = trans->r_lsn;
 		__entry->type = ITEM_TYPE(item);
 		__entry->pass = pass;
 		__entry->count = item->ri_cnt;
 		__entry->total = item->ri_total;
 	),
-	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
-		  "item region count/total %d/%d",
+	TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, "
+		  "item type %s item region count/total %d/%d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->tid,
+		  __entry->lsn,
 		  __entry->pass,
 		  (void *)__entry->item,
 		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
@@ -2068,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
@@ -2558,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+		 xfs_extlen_t len),
+	TP_ARGS(pag, resv, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, resv)
+		__field(xfs_extlen_t, freeblks)
+		__field(xfs_extlen_t, flcount)
+		__field(xfs_extlen_t, reserved)
+		__field(xfs_extlen_t, asked)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		struct xfs_ag_resv	*r = xfs_perag_resv(pag, resv);
+
+		__entry->dev = pag->pag_mount->m_super->s_dev;
+		__entry->agno = pag->pag_agno;
+		__entry->resv = resv;
+		__entry->freeblks = pag->pagf_freeblks;
+		__entry->flcount = pag->pagf_flcount;
+		__entry->reserved = r ? r->ar_reserved : 0;
+		__entry->asked = r ? r->ar_asked : 0;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->resv,
+		  __entry->freeblks,
+		  __entry->flcount,
+		  __entry->reserved,
+		  __entry->asked,
+		  __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+		 xfs_extlen_t len), \
+	TP_ARGS(pag, type, len))
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5f3d33d16e67..70f42ea86dfb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -217,7 +217,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
 
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
 		 * in-core superblock's counter.  This should only
 		 * be applied to the on-disk superblock.
 		 */
-		ASSERT(delta < 0);
 		tp->t_res_fdblocks_delta += delta;
 		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
 			flags &= ~XFS_TRANS_SB_DIRTY;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 459ddec137a4..ab438647592a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
 
 	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
 
-	error = xfs_free_extent(tp, start_block, ext_len, oinfo);
+	error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+			XFS_AG_RESV_NONE);
 
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index ea62245fee26..62900938f26d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
 	arraytop = context->count + prefix_len + namelen + 1;
 	if (arraytop > context->firstu) {
 		context->count = -1;	/* insufficient space */
+		context->seen_enough = 1;
 		return 0;
 	}
 	offset = (char *)context->alist + context->count;