40 files changed, 2562 insertions, 254 deletions
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index c49cd7e796e7..682a59fabe3f 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -24,11 +24,6 @@ focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
 "exactly once" semantics and better control and throttling of the
 resources allocated for each client.
 
-Other NFSv4.1 features, Parallel NFS operations in particular,
-are still under development out of tree.
-See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
-for more information.
-
 The table below, taken from the NFSv4.1 document, lists
 the operations that are mandatory to implement (REQ), optional
 (OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -43,9 +38,7 @@ The OPTIONAL features identified and their abbreviations are as follows:
 The following abbreviations indicate the linux server implementation status.
 	I	Implemented NFSv4.1 operations.
 	NS	Not Supported.
-	NS*	unimplemented optional feature.
-	P	pNFS features implemented out of tree.
-	PNS	pNFS features that are not supported yet (out of tree).
+	NS*	Unimplemented optional feature.
 
 Operations
 
@@ -70,13 +63,13 @@ I  | DESTROY_SESSION      | REQ        |              | Section 18.37  |
 I  | EXCHANGE_ID          | REQ        |              | Section 18.35  |
 I  | FREE_STATEID         | REQ        |              | Section 18.38  |
    | GETATTR              | REQ        |              | Section 18.7   |
-P  | GETDEVICEINFO        | OPT        | pNFS (REQ)   | Section 18.40  |
-P  | GETDEVICELIST        | OPT        | pNFS (OPT)   | Section 18.41  |
+I  | GETDEVICEINFO        | OPT        | pNFS (REQ)   | Section 18.40  |
+NS*| GETDEVICELIST        | OPT        | pNFS (OPT)   | Section 18.41  |
    | GETFH                | REQ        |              | Section 18.8   |
 NS*| GET_DIR_DELEGATION   | OPT        | DDELG (REQ)  | Section 18.39  |
-P  | LAYOUTCOMMIT         | OPT        | pNFS (REQ)   | Section 18.42  |
-P  | LAYOUTGET            | OPT        | pNFS (REQ)   | Section 18.43  |
-P  | LAYOUTRETURN         | OPT        | pNFS (REQ)   | Section 18.44  |
+I  | LAYOUTCOMMIT         | OPT        | pNFS (REQ)   | Section 18.42  |
+I  | LAYOUTGET            | OPT        | pNFS (REQ)   | Section 18.43  |
+I  | LAYOUTRETURN         | OPT        | pNFS (REQ)   | Section 18.44  |
    | LINK                 | OPT        |              | Section 18.9   |
    | LOCK                 | REQ        |              | Section 18.10  |
    | LOCKT                | REQ        |              | Section 18.11  |
@@ -122,9 +115,9 @@ Callback Operations
    |                         | MNI       | or OPT)     |               |
    +-------------------------+-----------+-------------+---------------+
    | CB_GETATTR              | OPT       | FDELG (REQ) | Section 20.1  |
-P  | CB_LAYOUTRECALL         | OPT       | pNFS (REQ)  | Section 20.3  |
+I  | CB_LAYOUTRECALL         | OPT       | pNFS (REQ)  | Section 20.3  |
 NS*| CB_NOTIFY               | OPT       | DDELG (REQ) | Section 20.4  |
-P  | CB_NOTIFY_DEVICEID      | OPT       | pNFS (OPT)  | Section 20.12 |
+NS*| CB_NOTIFY_DEVICEID      | OPT       | pNFS (OPT)  | Section 20.12 |
 NS*| CB_NOTIFY_LOCK          | OPT       |             | Section 20.11 |
 NS*| CB_PUSH_DELEG           | OPT       | FDELG (OPT) | Section 20.5  |
    | CB_RECALL               | OPT       | FDELG,      | Section 20.2  |
diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt
new file mode 100644
index 000000000000..2143673cf154
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs-block-server.txt
@@ -0,0 +1,37 @@
+pNFS block layout server user guide
+
+The Linux NFS server now supports the pNFS block layout extension.  In this
+case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
+to handling all the metadata access to the NFS export also hands out layouts
+to the clients to directly access the underlying block devices that are
+shared with the client.
+
+To use pNFS block layouts with with the Linux NFS server the exported file
+system needs to support the pNFS block layouts (currently just XFS), and the
+file system must sit on shared storage (typically iSCSI) that is accessible
+to the clients in addition to the MDS.  As of now the file system needs to
+sit directly on the exported volume, striping or concatenation of
+volumes on the MDS and clients is not supported yet.
+
+On the server, pNFS block volume support is automatically if the file system
+support it.  On the client make sure the kernel has the CONFIG_PNFS_BLOCK
+option enabled, the blkmapd daemon from nfs-utils is running, and the
+file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client it calls
+/sbin/nfsd-recall-failed with the first argument set to the IP address of
+the client, and the second argument set to the device node without the /dev
+prefix for the file system to be fenced. Below is an example file that shows
+how to translate the device into a serial number from SCSI EVPD 0x80:
+
+cat > /sbin/nfsd-recall-failed << EOF
+#!/bin/sh
+
+CLIENT="$1"
+DEV="/dev/$2"
+EVPD=`sg_inq --page=0x80 ${DEV} | \
+	grep "Unit serial number:" | \
+	awk -F ': ' '{print $2}'`
+
+echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
+EOF
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
 static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
 {
 	/*
-	 * We can get away with a static buffer because we're only
-	 * called with BKL held.
+	 * We can get away with a static buffer because this is only called
+	 * from lockd, which is single-threaded.
 	 */
 	static char buf[2*NLM_MAXCOOKIELEN+1];
 	unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
 	return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
 
-static inline __be32 *
-nlm_encode_fh(__be32 *p, struct nfs_fh *f)
-{
-	*p++ = htonl(NFS2_FHSIZE);
-	memcpy(p, f->data, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
-}
-
 /*
  * Encode and decode owner handle
  */
diff --git a/fs/locks.c b/fs/locks.c
index 4d0d41163a50..4753218f308e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
 #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
 
 static bool lease_breaking(struct file_lock *fl)
@@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 
 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 {
+	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+		return false;
 	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
 		return false;
 	return locks_conflict(breaker, lease);
@@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp)
  * conflict with the lease we're trying to set.
  */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg)
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 {
 	int ret = 0;
 	struct inode *inode = dentry->d_inode;
 
+	if (flags & FL_LAYOUT)
+		return 0;
+
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		return -EAGAIN;
 
@@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(dentry, arg);
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
 	if (error)
 		goto out;
 
@@ -1661,7 +1666,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 */
 	error = -EAGAIN;
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (fl->fl_file == filp) {
+		if (fl->fl_file == filp &&
+		    fl->fl_owner == lease->fl_owner) {
 			my_fl = fl;
 			continue;
 		}
@@ -1702,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(dentry, arg);
+	error = check_conflicting_open(dentry, arg, lease->fl_flags);
 	if (error) {
 		locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
 		goto out;
@@ -1721,7 +1727,7 @@ out:
 	return error;
 }
 
-static int generic_delete_lease(struct file *filp)
+static int generic_delete_lease(struct file *filp, void *owner)
 {
 	int error = -EAGAIN;
 	struct file_lock *fl, *victim = NULL;
@@ -1737,7 +1743,8 @@ static int generic_delete_lease(struct file *filp)
 
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (fl->fl_file == filp) {
+		if (fl->fl_file == filp &&
+		    fl->fl_owner == owner) {
 			victim = fl;
 			break;
 		}
@@ -1778,13 +1785,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 
 	switch (arg) {
 	case F_UNLCK:
-		return generic_delete_lease(filp);
+		return generic_delete_lease(filp, *priv);
 	case F_RDLCK:
 	case F_WRLCK:
 		if (!(*flp)->fl_lmops->lm_break) {
 			WARN_ON_ONCE(1);
 			return -ENOLCK;
 		}
+
 		return generic_add_lease(filp, arg, flp, priv);
 	default:
 		return -EINVAL;
@@ -1857,7 +1865,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
 	if (arg == F_UNLCK)
-		return vfs_setlease(filp, F_UNLCK, NULL, NULL);
+		return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
 	return do_fcntl_add_lease(fd, filp, arg);
 }
 
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
 
 	  If unsure, say N.
 
+config NFSD_PNFS
+	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	depends on NFSD_V4
+	help
+	  This option enables support for the parallel NFS features of the
+	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+	  server.
+
+	  If unsure, say N.
+
 config NFSD_V4_SECURITY_LABEL
 	bool "Provide Security Label support for NFSv4 server"
 	depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
 # Makefile for the Linux nfs server
 #
 
+ccflags-y += -I$(src)			# needed for trace events
+
 obj-$(CONFIG_NFSD)	+= nfsd.o
 
-nfsd-y 			:= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y			+= trace.o
+
+nfsd-y 			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
 			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_PNFS
+
+
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev;
+	struct pnfs_block_volume *b;
+
+	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	gdp->gd_device = dev;
+
+	dev->nr_volumes = 1;
+	b = &dev->volumes[0];
+
+	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+			&b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	if (sb->s_bdev != sb->s_bdev->bd_contains)
+		return nfserr_inval;
+	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+		struct nfsd4_layoutget *args)
+{
+	struct nfsd4_layout_seg *seg = &args->lg_seg;
+	struct super_block *sb = inode->i_sb;
+	u32 block_size = (1 << inode->i_blkbits);
+	struct pnfs_block_extent *bex;
+	struct iomap iomap;
+	u32 device_generation = 0;
+	int error;
+
+	/*
+	 * We do not attempt to support I/O smaller than the fs block size,
+	 * or not aligned to it.
+	 */
+	if (args->lg_minlength < block_size) {
+		dprintk("pnfsd: I/O too small\n");
+		goto out_layoutunavailable;
+	}
+	if (seg->offset & (block_size - 1)) {
+		dprintk("pnfsd: I/O misaligned\n");
+		goto out_layoutunavailable;
+	}
+
+	/*
+	 * Some clients barf on non-zero block numbers for NONE or INVALID
+	 * layouts, so make sure to zero the whole structure.
+	 */
+	error = -ENOMEM;
+	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+	if (!bex)
+		goto out_error;
+	args->lg_content = bex;
+
+	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+					    &iomap, seg->iomode != IOMODE_READ,
+					    &device_generation);
+	if (error) {
+		if (error == -ENXIO)
+			goto out_layoutunavailable;
+		goto out_error;
+	}
+
+	if (iomap.length < args->lg_minlength) {
+		dprintk("pnfsd: extent smaller than minlength\n");
+		goto out_layoutunavailable;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (seg->iomode == IOMODE_READ)
+			bex->es = PNFS_BLOCK_READ_DATA;
+		else
+			bex->es = PNFS_BLOCK_READWRITE_DATA;
+		bex->soff = (iomap.blkno << 9);
+		break;
+	case IOMAP_UNWRITTEN:
+		if (seg->iomode & IOMODE_RW) {
+			/*
+			 * Crack monkey special case from section 2.3.1.
+			 */
+			if (args->lg_minlength == 0) {
+				dprintk("pnfsd: no soup for you!\n");
+				goto out_layoutunavailable;
+			}
+
+			bex->es = PNFS_BLOCK_INVALID_DATA;
+			bex->soff = (iomap.blkno << 9);
+			break;
+		}
+		/*FALLTHRU*/
+	case IOMAP_HOLE:
+		if (seg->iomode == IOMODE_READ) {
+			bex->es = PNFS_BLOCK_NONE_DATA;
+			break;
+		}
+		/*FALLTHRU*/
+	case IOMAP_DELALLOC:
+	default:
+		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+		goto out_layoutunavailable;
+	}
+
+	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+	if (error)
+		goto out_error;
+	bex->foff = iomap.offset;
+	bex->len = iomap.length;
+
+	seg->offset = iomap.offset;
+	seg->length = iomap.length;
+
+	dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+	return 0;
+
+out_error:
+	seg->length = 0;
+	return nfserrno(error);
+out_layoutunavailable:
+	seg->length = 0;
+	return nfserr_layoutunavailable;
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	loff_t new_size = lcp->lc_last_wr + 1;
+	struct iattr iattr = { .ia_valid = 0 };
+	struct iomap *iomaps;
+	int nr_iomaps;
+	int error;
+
+	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+	if (nr_iomaps < 0)
+		return nfserrno(nr_iomaps);
+
+	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+	    timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+		lcp->lc_mtime = current_fs_time(inode->i_sb);
+	iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
+	iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
+
+	if (new_size > i_size_read(inode)) {
+		iattr.ia_valid |= ATTR_SIZE;
+		iattr.ia_size = new_size;
+	}
+
+	error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+			nr_iomaps, &iattr);
+	kfree(iomaps);
+	return nfserrno(error);
+}
+
+const struct nfsd4_layout_ops bl_layout_ops = {
+	.proc_getdeviceinfo	= nfsd4_block_proc_getdeviceinfo,
+	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
+	.proc_layoutget		= nfsd4_block_proc_layoutget,
+	.encode_layoutget	= nfsd4_block_encode_layoutget,
+	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
+};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_PNFS
+
+
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct pnfs_block_extent *b = lgp->lg_content;
+	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+	if (!p)
+		return nfserr_toosmall;
+
+	*p++ = cpu_to_be32(len);
+	*p++ = cpu_to_be32(1);		/* we always return a single extent */
+
+	p = xdr_encode_opaque_fixed(p, &b->vol_id,
+			sizeof(struct nfsd4_deviceid));
+	p = xdr_encode_hyper(p, b->foff);
+	p = xdr_encode_hyper(p, b->len);
+	p = xdr_encode_hyper(p, b->soff);
+	*p++ = cpu_to_be32(b->es);
+	return 0;
+}
+
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int len;
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+		p = xdr_reserve_space(xdr, len);
+		if (!p)
+			return -ETOOSMALL;
+
+		*p++ = cpu_to_be32(b->type);
+		*p++ = cpu_to_be32(1);	/* single signature */
+		p = xdr_encode_hyper(p, b->simple.offset);
+		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	return len;
+}
+
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *dev = gdp->gd_device;
+	int len = sizeof(__be32), ret, i;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, len + sizeof(__be32));
+	if (!p)
+		return nfserr_resource;
+
+	for (i = 0; i < dev->nr_volumes; i++) {
+		ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
+		if (ret < 0)
+			return nfserrno(ret);
+		len += ret;
+	}
+
+	/*
+	 * Fill in the overall length and number of volumes at the beginning
+	 * of the layout.
+	 */
+	*p++ = cpu_to_be32(len);
+	*p++ = cpu_to_be32(dev->nr_volumes);
+	return 0;
+}
+
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, expected, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		struct pnfs_block_extent bex;
+
+		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+
+		p = xdr_decode_hyper(p, &bex.foff);
+		if (bex.foff & (block_size - 1)) {
+			dprintk("%s: unaligned offset %lld\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.len);
+		if (bex.len & (block_size - 1)) {
+			dprintk("%s: unaligned length %lld\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.soff);
+		if (bex.soff & (block_size - 1)) {
+			dprintk("%s: unaligned disk offset %lld\n",
+				__func__, bex.soff);
+			goto fail;
+		}
+		bex.es = be32_to_cpup(p++);
+		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+			dprintk("%s: incorrect extent state %d\n",
+				__func__, bex.es);
+			goto fail;
+		}
+
+		iomaps[i].offset = bex.foff;
+		iomaps[i].length = bex.len;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+
+#include <linux/blkdev.h>
+#include "xdr4.h"
+
+struct iomap;
+struct xdr_stream;
+
+enum pnfs_block_extent_state {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2,
+	PNFS_BLOCK_NONE_DATA		= 3,
+};
+
+struct pnfs_block_extent {
+	struct nfsd4_deviceid		vol_id;
+	u64				foff;
+	u64				len;
+	u64				soff;
+	enum pnfs_block_extent_state	es;
+};
+#define NFS4_BLOCK_EXTENT_SIZE		44
+
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
+};
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN	128
+
+struct pnfs_block_volume {
+	enum pnfs_block_volume_type	type;
+	union {
+		struct {
+			u64		offset;
+			u32		sig_len;
+			u8		sig[PNFS_BLOCK_UUID_LEN];
+		} simple;
+	};
+};
+
+struct pnfs_block_deviceaddr {
+	u32				nr_volumes;
+	struct pnfs_block_volume	volumes[];
+};
+
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+		struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size);
+
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
 #include "nfsd.h"
 #include "nfsfh.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 	exp.cd = cd;
+	exp.ex_devid_map = NULL;
 
 	/* expiry */
 	err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (!gid_valid(exp.ex_anon_gid))
 			goto out4;
 		err = 0;
+
+		nfsd4_setup_layout_type(&exp);
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_layout_type = 0;
 	new->ex_uuid = NULL;
 	new->cd = item->cd;
 }
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_devid_map = item->ex_devid_map;
+	item->ex_devid_map = NULL;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_layout_type = item->ex_layout_type;
 	new->ex_nflavors = item->ex_nflavors;
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
 	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	enum pnfs_layouttype	ex_layout_type;
+	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 };
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
 	return status;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ *	struct layoutrecall_file4 {
+ *		nfs_fh4         lor_fh;
+ *		offset4         lor_offset;
+ *		length4         lor_length;
+ *		stateid4        lor_stateid;
+ *	};
+ *
+ *	union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ *	case LAYOUTRECALL4_FILE:
+ *		layoutrecall_file4 lor_layout;
+ *	case LAYOUTRECALL4_FSID:
+ *		fsid4              lor_fsid;
+ *	case LAYOUTRECALL4_ALL:
+ *		void;
+ *	};
+ *
+ *	struct CB_LAYOUTRECALL4args {
+ *		layouttype4             clora_type;
+ *		layoutiomode4           clora_iomode;
+ *		bool                    clora_changed;
+ *		layoutrecall4           clora_recall;
+ *	};
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+				  const struct nfs4_layout_stateid *ls,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	BUG_ON(hdr->minorversion == 0);
+
+	p = xdr_reserve_space(xdr, 5 * 4);
+	*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+	*p++ = cpu_to_be32(ls->ls_layout_type);
+	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(RETURN_FILE);
+
+	encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+	p = xdr_reserve_space(xdr, 2 * 8);
+	p = xdr_encode_hyper(p, 0);
+	xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+	encode_stateid4(xdr, &ls->ls_recall_sid);
+
+	hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfsd4_callback *cb)
+{
+	const struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_layout4args(xdr, ls, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	enum nfsstat4 nfserr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		goto out;
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status))
+			goto out;
+	}
+	status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+	if (unlikely(status))
+		goto out;
+	if (unlikely(nfserr != NFS4_OK))
+		status = nfs_cb_stat_to_errno(nfserr);
+out:
+	return status;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 /*
  * RPC procedure tables
  */
@@ -563,6 +659,9 @@ out:
 static struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
 	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
+#endif
 };
 
 static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY                NFSDDBG_PNFS
+
+struct nfs4_layout {
+	struct list_head		lo_perstate;
+	struct nfs4_layout_stateid	*lo_state;
+	struct nfsd4_layout_seg		lo_seg;
+};
+
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
+};
+
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS	8
+#define DEVID_HASH_SIZE	(1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK	(DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+static inline u32 devid_hashfn(u64 idx)
+{
+	return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
+}
+
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+	const struct knfsd_fh *fh = &fhp->fh_handle;
+	size_t fsid_len = key_len(fh->fh_fsid_type);
+	struct nfsd4_deviceid_map *map, *old;
+	int i;
+
+	map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+	if (!map)
+		return;
+
+	map->fsid_type = fh->fh_fsid_type;
+	memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+	spin_lock(&nfsd_devid_lock);
+	if (fhp->fh_export->ex_devid_map)
+		goto out_unlock;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+			if (old->fsid_type != fh->fh_fsid_type)
+				continue;
+			if (memcmp(old->fsid, fh->fh_fsid,
+					key_len(old->fsid_type)))
+				continue;
+
+			fhp->fh_export->ex_devid_map = old;
+			goto out_unlock;
+		}
+	}
+
+	map->idx = nfsd_devid_seq++;
+	list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+	fhp->fh_export->ex_devid_map = map;
+	map = NULL;
+
+out_unlock:
+	spin_unlock(&nfsd_devid_lock);
+	kfree(map);
+}
+
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+	struct nfsd4_deviceid_map *map, *ret = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+		if (map->idx == idx)
+			ret = map;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation)
+{
+	if (!fhp->fh_export->ex_devid_map) {
+		nfsd4_alloc_devid_map(fhp);
+		if (!fhp->fh_export->ex_devid_map)
+			return -ENOMEM;
+	}
+
+	id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
+	id->generation = device_generation;
+	id->pad = 0;
+	return 0;
+}
+
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+	struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+
+	if (exp->ex_flags & NFSEXP_NOPNFS)
+		return;
+
+	if (sb->s_export_op->get_uuid &&
+	    sb->s_export_op->map_blocks &&
+	    sb->s_export_op->commit_blocks)
+		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+}
+
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+	struct nfs4_layout_stateid *ls = layoutstateid(stid);
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+	trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+
+	spin_lock(&clp->cl_lock);
+	list_del_init(&ls->ls_perclnt);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_del_init(&ls->ls_perfile);
+	spin_unlock(&fp->fi_lock);
+
+	vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+	fput(ls->ls_file);
+
+	if (ls->ls_recalled)
+		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+
+	kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+
+static int
+nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+{
+	struct file_lock *fl;
+	int status;
+
+	fl = locks_alloc_lock();
+	if (!fl)
+		return -ENOMEM;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd4_layouts_lm_ops;
+	fl->fl_flags = FL_LAYOUT;
+	fl->fl_type = F_RDLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = ls;
+	fl->fl_pid = current->tgid;
+	fl->fl_file = ls->ls_file;
+
+	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+	if (status) {
+		locks_free_lock(fl);
+		return status;
+	}
+	BUG_ON(fl != NULL);
+	return 0;
+}
+
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+		struct nfs4_stid *parent, u32 layout_type)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_file *fp = parent->sc_file;
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stp;
+
+	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+	if (!stp)
+		return NULL;
+	stp->sc_free = nfsd4_free_layout_stateid;
+	get_nfs4_file(fp);
+	stp->sc_file = fp;
+
+	ls = layoutstateid(stp);
+	INIT_LIST_HEAD(&ls->ls_perclnt);
+	INIT_LIST_HEAD(&ls->ls_perfile);
+	spin_lock_init(&ls->ls_lock);
+	INIT_LIST_HEAD(&ls->ls_layouts);
+	ls->ls_layout_type = layout_type;
+	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+			NFSPROC4_CLNT_CB_LAYOUT);
+
+	if (parent->sc_type == NFS4_DELEG_STID)
+		ls->ls_file = get_file(fp->fi_deleg_file);
+	else
+		ls->ls_file = find_any_file(fp);
+	BUG_ON(!ls->ls_file);
+
+	if (nfsd4_layout_setlease(ls)) {
+		put_nfs4_file(fp);
+		kmem_cache_free(nfs4_layout_stateid_cache, ls);
+		return NULL;
+	}
+
+	spin_lock(&clp->cl_lock);
+	stp->sc_type = NFS4_LAYOUT_STID;
+	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_add(&ls->ls_perfile, &fp->fi_lo_states);
+	spin_unlock(&fp->fi_lock);
+
+	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+	return ls;
+}
+
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stid;
+	unsigned char typemask = NFS4_LAYOUT_STID;
+	__be32 status;
+
+	if (create)
+		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+			net_generic(SVC_NET(rqstp), nfsd_net_id));
+	if (status)
+		goto out;
+
+	if (!fh_match(&cstate->current_fh.fh_handle,
+		      &stid->sc_file->fi_fhandle)) {
+		status = nfserr_bad_stateid;
+		goto out_put_stid;
+	}
+
+	if (stid->sc_type != NFS4_LAYOUT_STID) {
+		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+		nfs4_put_stid(stid);
+
+		status = nfserr_jukebox;
+		if (!ls)
+			goto out;
+	} else {
+		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+		status = nfserr_bad_stateid;
+		if (stateid->si_generation > stid->sc_stateid.si_generation)
+			goto out_put_stid;
+		if (layout_type != ls->ls_layout_type)
+			goto out_put_stid;
+	}
+
+	*lsp = ls;
+	return 0;
+
+out_put_stid:
+	nfs4_put_stid(stid);
+out:
+	return status;
+}
+
+static void
+nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+{
+	spin_lock(&ls->ls_lock);
+	if (ls->ls_recalled)
+		goto out_unlock;
+
+	ls->ls_recalled = true;
+	atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+	if (list_empty(&ls->ls_layouts))
+		goto out_unlock;
+
+	trace_layout_recall(&ls->ls_stid.sc_stateid);
+
+	atomic_inc(&ls->ls_stid.sc_count);
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	nfsd4_run_cb(&ls->ls_recall);
+
+out_unlock:
+	spin_unlock(&ls->ls_lock);
+}
+
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+	u64 end = seg->offset + seg->length;
+	return end >= seg->offset ? end : NFS4_MAX_UINT64;
+}
+
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+	if (end == NFS4_MAX_UINT64)
+		lo->length = NFS4_MAX_UINT64;
+	else
+		lo->length = end - lo->offset;
+}
+
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+	if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
+		return false;
+	if (layout_end(&lo->lo_seg) <= s->offset)
+		return false;
+	if (layout_end(s) <= lo->lo_seg.offset)
+		return false;
+	return true;
+}
+
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+	if (lo->iomode != new->iomode)
+		return false;
+	if (layout_end(new) < lo->offset)
+		return false;
+	if (layout_end(lo) < new->offset)
+		return false;
+
+	lo->offset = min(lo->offset, new->offset);
+	layout_update_len(lo, max(layout_end(lo), layout_end(new)));
+	return true;
+}
+
+static __be32
+nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+	struct nfs4_layout_stateid *l, *n;
+	__be32 nfserr = nfs_ok;
+
+	assert_spin_locked(&fp->fi_lock);
+
+	list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
+		if (l != ls) {
+			nfsd4_recall_file_layout(l);
+			nfserr = nfserr_recallconflict;
+		}
+	}
+
+	return nfserr;
+}
+
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+	struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+	struct nfs4_layout *lp, *new = NULL;
+	__be32 nfserr;
+
+	spin_lock(&fp->fi_lock);
+	nfserr = nfsd4_recall_conflict(ls);
+	if (nfserr)
+		goto out;
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+	spin_unlock(&ls->ls_lock);
+	spin_unlock(&fp->fi_lock);
+
+	new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+	if (!new)
+		return nfserr_jukebox;
+	memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+	new->lo_state = ls;
+
+	spin_lock(&fp->fi_lock);
+	nfserr = nfsd4_recall_conflict(ls);
+	if (nfserr)
+		goto out;
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+		if (layouts_try_merge(&lp->lo_seg, seg))
+			goto done;
+	}
+
+	atomic_inc(&ls->ls_stid.sc_count);
+	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+	new = NULL;
+done:
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+	spin_unlock(&ls->ls_lock);
+out:
+	spin_unlock(&fp->fi_lock);
+	if (new)
+		kmem_cache_free(nfs4_layout_cache, new);
+	return nfserr;
+}
+
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+	while (!list_empty(reaplist)) {
+		struct nfs4_layout *lp = list_first_entry(reaplist,
+				struct nfs4_layout, lo_perstate);
+
+		list_del(&lp->lo_perstate);
+		nfs4_put_stid(&lp->lo_state->ls_stid);
+		kmem_cache_free(nfs4_layout_cache, lp);
+	}
+}
+
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+		struct list_head *reaplist)
+{
+	struct nfsd4_layout_seg *lo = &lp->lo_seg;
+	u64 end = layout_end(lo);
+
+	if (seg->offset <= lo->offset) {
+		if (layout_end(seg) >= end) {
+			list_move_tail(&lp->lo_perstate, reaplist);
+			return;
+		}
+		end = seg->offset;
+	} else {
+		/* retain the whole layout segment on a split. */
+		if (layout_end(seg) < end) {
+			dprintk("%s: split not supported\n", __func__);
+			return;
+		}
+
+		lo->offset = layout_end(seg);
+	}
+
+	layout_update_len(lo, end);
+}
+
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_layout *lp, *n;
+	LIST_HEAD(reaplist);
+	__be32 nfserr;
+	int found = 0;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+						false, lrp->lr_layout_type,
+						&ls);
+	if (nfserr) {
+		trace_layout_return_lookup_fail(&lrp->lr_sid);
+		return nfserr;
+	}
+
+	spin_lock(&ls->ls_lock);
+	list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+		if (layouts_overlapping(lp, &lrp->lr_seg)) {
+			nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+			found++;
+		}
+	}
+	if (!list_empty(&ls->ls_layouts)) {
+		if (found) {
+			update_stateid(&ls->ls_stid.sc_stateid);
+			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+				sizeof(stateid_t));
+		}
+		lrp->lrs_present = 1;
+	} else {
+		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+		nfs4_unhash_stid(&ls->ls_stid);
+		lrp->lrs_present = 0;
+	}
+	spin_unlock(&ls->ls_lock);
+
+	nfs4_put_stid(&ls->ls_stid);
+	nfsd4_free_layouts(&reaplist);
+	return nfs_ok;
+}
+
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_layout *lp, *t;
+	LIST_HEAD(reaplist);
+
+	lrp->lrs_present = 0;
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+		if (lrp->lr_return_type == RETURN_FSID &&
+		    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+				   &cstate->current_fh.fh_handle))
+			continue;
+
+		spin_lock(&ls->ls_lock);
+		list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+			if (lrp->lr_seg.iomode == IOMODE_ANY ||
+			    lrp->lr_seg.iomode == lp->lo_seg.iomode)
+				list_move_tail(&lp->lo_perstate, &reaplist);
+		}
+		spin_unlock(&ls->ls_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+	return 0;
+}
+
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+		struct list_head *reaplist)
+{
+	spin_lock(&ls->ls_lock);
+	list_splice_init(&ls->ls_layouts, reaplist);
+	spin_unlock(&ls->ls_lock);
+}
+
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+		nfsd4_return_all_layouts(ls, &reaplist);
+	spin_unlock(&clp->cl_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_layout_stateid *ls, *n;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+		if (ls->ls_stid.sc_client == clp)
+			nfsd4_return_all_layouts(ls, &reaplist);
+	}
+	spin_unlock(&fp->fi_lock);
+
+	nfsd4_free_layouts(&reaplist);
+}
+
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	char addr_str[INET6_ADDRSTRLEN];
+	static char *envp[] = {
+		"HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[8];
+	int error;
+
+	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+
+	nfsd4_cb_layout_fail(ls);
+
+	printk(KERN_WARNING
+		"nfsd: client %s failed to respond to layout recall. "
+		"  Fencing..\n", addr_str);
+
+	argv[0] = "/sbin/nfsd-recall-failed";
+	argv[1] = addr_str;
+	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+	argv[3] = NULL;
+
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error) {
+		printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+			addr_str, error);
+	}
+}
+
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	LIST_HEAD(reaplist);
+
+	switch (task->tk_status) {
+	case 0:
+		return 1;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+		task->tk_status = 0;
+		return 1;
+	case -NFS4ERR_DELAY:
+		/* Poll the client until it's done with the layout */
+		/* FIXME: cap number of retries.
+		 * The pnfs standard states that we need to only expire
+		 * the client after at-least "lease time" .eg lease-time * 2
+		 * when failing to communicate a recall
+		 */
+		rpc_delay(task, HZ/100); /* 10 mili-seconds */
+		return 0;
+	default:
+		/*
+		 * Unknown error or non-responding client, we'll need to fence.
+		 */
+		nfsd4_cb_layout_fail(ls);
+		return -1;
+	}
+}
+
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	LIST_HEAD(reaplist);
+
+	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+
+	nfsd4_return_all_layouts(ls, &reaplist);
+	nfsd4_free_layouts(&reaplist);
+	nfs4_put_stid(&ls->ls_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+	.done		= nfsd4_cb_layout_done,
+	.release	= nfsd4_cb_layout_release,
+};
+
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+	/*
+	 * We don't want the locks code to timeout the lease for us;
+	 * we'll remove it ourself if a layout isn't returned
+	 * in time:
+	 */
+	fl->fl_break_time = 0;
+	nfsd4_recall_file_layout(fl->fl_owner);
+	return false;
+}
+
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+		struct list_head *dispose)
+{
+	BUG_ON(!(arg & F_UNLCK));
+	return lease_modify(onlist, arg, dispose);
+}
+
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+	.lm_break	= nfsd4_layout_lm_break,
+	.lm_change	= nfsd4_layout_lm_change,
+};
+
+int
+nfsd4_init_pnfs(void)
+{
+	int i;
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nfsd_devid_hash[i]);
+
+	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+			sizeof(struct nfs4_layout), 0, 0, NULL);
+	if (!nfs4_layout_cache)
+		return -ENOMEM;
+
+	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	if (!nfs4_layout_stateid_cache) {
+		kmem_cache_destroy(nfs4_layout_cache);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void
+nfsd4_exit_pnfs(void)
+{
+	int i;
+
+	kmem_cache_destroy(nfs4_layout_cache);
+	kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+	for (i = 0; i < DEVID_HASH_SIZE; i++) {
+		struct nfsd4_deviceid_map *map, *n;
+
+		list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+			kfree(map);
+	}
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
 #include "current_stateid.h"
 #include "netns.h"
 #include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+	if (!exp->ex_layout_type) {
+		dprintk("%s: export does not support pNFS\n", __func__);
+		return NULL;
+	}
+
+	if (exp->ex_layout_type != layout_type) {
+		dprintk("%s: layout type %d not supported\n",
+			__func__, layout_type);
+		return NULL;
+	}
+
+	return nfsd4_layout_ops[layout_type];
+}
+
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	const struct nfsd4_layout_ops *ops;
+	struct nfsd4_deviceid_map *map;
+	struct svc_export *exp;
+	__be32 nfserr;
+
+	dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+	       __func__,
+	       gdp->gd_layout_type,
+	       gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+	       gdp->gd_maxcount);
+
+	map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+	if (!map) {
+		dprintk("%s: couldn't find device ID to export mapping!\n",
+			__func__);
+		return nfserr_noent;
+	}
+
+	exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+	if (IS_ERR(exp)) {
+		dprintk("%s: could not find device id\n", __func__);
+		return nfserr_noent;
+	}
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+	if (!ops)
+		goto out;
+
+	nfserr = nfs_ok;
+	if (gdp->gd_maxcount != 0)
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+
+	gdp->gd_notify_types &= ops->notify_types;
+	exp_put(exp);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutget *lgp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+	int accmode;
+
+	switch (lgp->lg_seg.iomode) {
+	case IOMODE_READ:
+		accmode = NFSD_MAY_READ;
+		break;
+	case IOMODE_RW:
+		accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n",
+			__func__, lgp->lg_seg.iomode);
+		nfserr = nfserr_badiomode;
+		goto out;
+	}
+
+	nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+	if (!ops)
+		goto out;
+
+	/*
+	 * Verify minlength and range as per RFC5661:
+	 *  o  If loga_length is less than loga_minlength,
+	 *     the metadata server MUST return NFS4ERR_INVAL.
+	 *  o  If the sum of loga_offset and loga_minlength exceeds
+	 *     NFS4_UINT64_MAX, and loga_minlength is not
+	 *     NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+	 *  o  If the sum of loga_offset and loga_length exceeds
+	 *     NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+	 *     the error NFS4ERR_INVAL MUST result.
+	 */
+	nfserr = nfserr_inval;
+	if (lgp->lg_seg.length < lgp->lg_minlength ||
+	    (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+	     lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+	    (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+	     lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+		goto out;
+	if (lgp->lg_seg.length == 0)
+		goto out;
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+						true, lgp->lg_layout_type, &ls);
+	if (nfserr) {
+		trace_layout_get_lookup_fail(&lgp->lg_sid);
+		goto out;
+	}
+
+	nfserr = nfserr_recallconflict;
+	if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+		goto out_put_stid;
+
+	nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+				     current_fh, lgp);
+	if (nfserr)
+		goto out_put_stid;
+
+	nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutcommit *lcp)
+{
+	const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+	struct svc_fh *current_fh = &cstate->current_fh;
+	const struct nfsd4_layout_ops *ops;
+	loff_t new_size = lcp->lc_last_wr + 1;
+	struct inode *inode;
+	struct nfs4_layout_stateid *ls;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+	if (!ops)
+		goto out;
+	inode = current_fh->fh_dentry->d_inode;
+
+	nfserr = nfserr_inval;
+	if (new_size <= seg->offset) {
+		dprintk("pnfsd: last write before layout segment\n");
+		goto out;
+	}
+	if (new_size > seg->offset + seg->length) {
+		dprintk("pnfsd: last write beyond layout segment\n");
+		goto out;
+	}
+	if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+		dprintk("pnfsd: layoutcommit beyond EOF\n");
+		goto out;
+	}
+
+	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+						false, lcp->lc_layout_type,
+						&ls);
+	if (nfserr) {
+		trace_layout_commit_lookup_fail(&lcp->lc_sid);
+		/* fixup error code as per RFC5661 */
+		if (nfserr == nfserr_bad_stateid)
+			nfserr = nfserr_badlayout;
+		goto out;
+	}
+
+	nfserr = ops->proc_layoutcommit(inode, lcp);
+	if (nfserr)
+		goto out_put_stid;
+
+	if (new_size > i_size_read(inode)) {
+		lcp->lc_size_chg = 1;
+		lcp->lc_newsize = new_size;
+	} else {
+		lcp->lc_size_chg = 0;
+	}
+
+out_put_stid:
+	nfs4_put_stid(&ls->ls_stid);
+out:
+	return nfserr;
+}
+
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct svc_fh *current_fh = &cstate->current_fh;
+	__be32 nfserr;
+
+	nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_layoutunavailable;
+	if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+		goto out;
+
+	switch (lrp->lr_seg.iomode) {
+	case IOMODE_READ:
+	case IOMODE_RW:
+	case IOMODE_ANY:
+		break;
+	default:
+		dprintk("%s: invalid iomode %d\n", __func__,
+			lrp->lr_seg.iomode);
+		nfserr = nfserr_inval;
+		goto out;
+	}
+
+	switch (lrp->lr_return_type) {
+	case RETURN_FILE:
+		nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+		break;
+	case RETURN_FSID:
+	case RETURN_ALL:
+		nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+		break;
+	default:
+		dprintk("%s: invalid return_type %d\n", __func__,
+			lrp->lr_return_type);
+		nfserr = nfserr_inval;
+		break;
+	}
+out:
+	return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 /*
  * NULL call.
  */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
 		op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
 
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE		128
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* logr_return_on_close */ +
+		op_encode_stateid_maxsz +
+		1 /* nr of layouts */ +
+		MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* locr_newsize */ +
+		2 /* ns_size */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* lrs_stateid */ +
+		op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_name = "OP_GETDEVICEINFO",
+	},
+	[OP_LAYOUTGET] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutget,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTGET",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+	},
+	[OP_LAYOUTCOMMIT] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutcommit,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTCOMMIT",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+	},
+	[OP_LAYOUTRETURN] = {
+		.op_func = (nfsd4op_func)nfsd4_layoutreturn,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_LAYOUTRETURN",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+	},
+#endif /* CONFIG_NFSD_PNFS */
 
 	/* NFSv4.2 operations */
 	[OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 532a60cca2fb..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
 #include "current_stateid.h"
 
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
-static inline void
-renew_client(struct nfs4_client *clp)
-{
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	spin_lock(&nn->client_lock);
-	renew_client_locked(clp);
-	spin_unlock(&nn->client_lock);
-}
-
 static void put_client_renew_locked(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
 	kmem_cache_free(file_slab, fp);
 }
 
-static inline void
+void
 put_nfs4_file(struct nfs4_file *fi)
 {
 	might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
 	}
 }
 
-static inline void
-get_nfs4_file(struct nfs4_file *fi)
-{
-	atomic_inc(&fi->fi_ref);
-}
-
 static struct file *
 __nfs4_get_fd(struct nfs4_file *f, int oflag)
 {
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
 	return ret;
 }
 
-static struct file *
+struct file *
 find_any_file(struct nfs4_file *f)
 {
 	struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
 	return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
 }
 
-static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
-{
-	return fh1->fh_size == fh2->fh_size &&
-		!memcmp(fh1->fh_base.fh_pad,
-				fh2->fh_base.fh_pad,
-				fh1->fh_size);
-}
-
 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
 
 static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
 		__nfs4_file_put_access(fp, O_RDONLY);
 }
 
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 					 struct kmem_cache *slab)
 {
 	struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 	struct file *filp = NULL;
 
 	spin_lock(&fp->fi_lock);
-	if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+	if (fp->fi_deleg_file && --fp->fi_delegees == 0)
 		swap(filp, fp->fi_deleg_file);
 	spin_unlock(&fp->fi_lock);
 
 	if (filp) {
-		vfs_setlease(filp, F_UNLCK, NULL, NULL);
+		vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
 		fput(filp);
 	}
 }
 
-static void unhash_stid(struct nfs4_stid *s)
+void nfs4_unhash_stid(struct nfs4_stid *s)
 {
 	s->sc_type = 0;
 }
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
 
 	list_del_init(&stp->st_locks);
 	unhash_ol_stateid(stp);
-	unhash_stid(&stp->st_stid);
+	nfs4_unhash_stid(&stp->st_stid);
 }
 
 static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
 static int
 STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
 {
-	if (clid->cl_boot == nn->boot_time)
+	/*
+	 * We're assuming the clid was not given out from a boot
+	 * precisely 2^32 (about 136 years) before this one.  That seems
+	 * a safe assumption:
+	 */
+	if (clid->cl_boot == (u32)nn->boot_time)
 		return 0;
 	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
 		clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_callbacks);
 	INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
 	spin_lock_init(&clp->cl_lock);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
 		nfs4_get_stateowner(&oo->oo_owner);
 		release_openowner(oo);
 	}
+	nfsd4_return_all_client_layouts(clp);
 	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 static void
 nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
 {
-	/* pNFS is not supported */
+#ifdef CONFIG_NFSD_PNFS
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
 	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
 
 	/* Referrals are supported, Migration is not. */
 	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
 	fp->fi_share_deny = 0;
 	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&fp->fi_lo_states);
+	atomic_set(&fp->fi_lo_recalls, 0);
+#endif
 	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
 }
 
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
 	struct nfs4_file *fp;
 
 	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
-		if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
+		if (fh_match(&fp->fi_fhandle, fh)) {
 			if (atomic_inc_not_zero(&fp->fi_ref))
 				return fp;
 		}
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
 	return NULL;
 }
 
-static struct nfs4_file *
+struct nfs4_file *
 find_file(struct knfsd_fh *fh)
 {
 	struct nfs4_file *fp;
@@ -3856,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
 	/* Race breaker */
 	if (fp->fi_deleg_file) {
 		status = 0;
-		atomic_inc(&fp->fi_delegees);
+		++fp->fi_delegees;
 		hash_delegation_locked(dp, fp);
 		goto out_unlock;
 	}
 	fp->fi_deleg_file = filp;
-	atomic_set(&fp->fi_delegees, 1);
+	fp->fi_delegees = 1;
 	hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
@@ -3902,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		status = -EAGAIN;
 		goto out_unlock;
 	}
-	atomic_inc(&fp->fi_delegees);
+	++fp->fi_delegees;
 	hash_delegation_locked(dp, fp);
 	status = 0;
 out_unlock:
@@ -4295,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
 
 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
 {
-	if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+	if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
 		return nfserr_bad_stateid;
 	return nfs_ok;
 }
@@ -4446,7 +4439,7 @@ out_unlock:
 	return status;
 }
 
-static __be32
+__be32
 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4860,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
+	nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
+				      stp->st_stid.sc_file);
+
 	nfsd4_close_open_stateid(stp);
 
 	/* put reference from nfs4_preprocess_seqid_op */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
 #include "state.h"
 #include "cache.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
 	return ret;
 }
 
+/*
+ * We require the high 32 bits of 'seconds' to be 0, and
+ * we ignore all 32 bits of 'nseconds'.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+{
+	DECODE_HEAD;
+	u64 sec;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &sec);
+	tv->tv_sec = sec;
+	tv->tv_nsec = be32_to_cpup(p++);
+	if (tv->tv_nsec >= (u32)1000000000)
+		return nfserr_inval;
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 {
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 {
 	int expected_len, len = 0;
 	u32 dummy32;
-	u64 sec;
 	char *buf;
 
 	DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		dummy32 = be32_to_cpup(p++);
 		switch (dummy32) {
 		case NFS4_SET_TO_CLIENT_TIME:
-			/* We require the high 32 bits of 'seconds' to be 0, and we ignore
-			   all 32 bits of 'nseconds'. */
-			READ_BUF(12);
 			len += 12;
-			p = xdr_decode_hyper(p, &sec);
-			iattr->ia_atime.tv_sec = (time_t)sec;
-			iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
-			if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
-				return nfserr_inval;
+			status = nfsd4_decode_time(argp, &iattr->ia_atime);
+			if (status)
+				return status;
 			iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
 			break;
 		case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		dummy32 = be32_to_cpup(p++);
 		switch (dummy32) {
 		case NFS4_SET_TO_CLIENT_TIME:
-			/* We require the high 32 bits of 'seconds' to be 0, and we ignore
-			   all 32 bits of 'nseconds'. */
-			READ_BUF(12);
 			len += 12;
-			p = xdr_decode_hyper(p, &sec);
-			iattr->ia_mtime.tv_sec = sec;
-			iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
-			if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
-				return nfserr_inval;
+			status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+			if (status)
+				return status;
 			iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
 			break;
 		case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
 	DECODE_TAIL;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	DECODE_HEAD;
+	u32 num, i;
+
+	READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+	COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+	gdev->gd_layout_type = be32_to_cpup(p++);
+	gdev->gd_maxcount = be32_to_cpup(p++);
+	num = be32_to_cpup(p++);
+	if (num) {
+		READ_BUF(4 * num);
+		gdev->gd_notify_types = be32_to_cpup(p++);
+		for (i = 1; i < num; i++) {
+			if (be32_to_cpup(p++)) {
+				status = nfserr_inval;
+				goto out;
+			}
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutget *lgp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(36);
+	lgp->lg_signal = be32_to_cpup(p++);
+	lgp->lg_layout_type = be32_to_cpup(p++);
+	lgp->lg_seg.iomode = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+	p = xdr_decode_hyper(p, &lgp->lg_minlength);
+	nfsd4_decode_stateid(argp, &lgp->lg_sid);
+	READ_BUF(4);
+	lgp->lg_maxcount = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutcommit *lcp)
+{
+	DECODE_HEAD;
+	u32 timechange;
+
+	READ_BUF(20);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+	lcp->lc_reclaim = be32_to_cpup(p++);
+	nfsd4_decode_stateid(argp, &lcp->lc_sid);
+	READ_BUF(4);
+	lcp->lc_newoffset = be32_to_cpup(p++);
+	if (lcp->lc_newoffset) {
+		READ_BUF(8);
+		p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+	} else
+		lcp->lc_last_wr = 0;
+	READ_BUF(4);
+	timechange = be32_to_cpup(p++);
+	if (timechange) {
+		status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+		if (status)
+			return status;
+	} else {
+		lcp->lc_mtime.tv_nsec = UTIME_NOW;
+	}
+	READ_BUF(8);
+	lcp->lc_layout_type = be32_to_cpup(p++);
+
+	/*
+	 * Save the layout update in XDR format and let the layout driver deal
+	 * with it later.
+	 */
+	lcp->lc_up_len = be32_to_cpup(p++);
+	if (lcp->lc_up_len > 0) {
+		READ_BUF(lcp->lc_up_len);
+		READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutreturn *lrp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(16);
+	lrp->lr_reclaim = be32_to_cpup(p++);
+	lrp->lr_layout_type = be32_to_cpup(p++);
+	lrp->lr_seg.iomode = be32_to_cpup(p++);
+	lrp->lr_return_type = be32_to_cpup(p++);
+	if (lrp->lr_return_type == RETURN_FILE) {
+		READ_BUF(16);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+		nfsd4_decode_stateid(argp, &lrp->lr_sid);
+		READ_BUF(4);
+		lrp->lrf_body_len = be32_to_cpup(p++);
+		if (lrp->lrf_body_len > 0) {
+			READ_BUF(lrp->lrf_body_len);
+			READMEM(lrp->lrf_body, lrp->lrf_body_len);
+		}
+	} else {
+		lrp->lr_seg.offset = 0;
+		lrp->lr_seg.length = NFS4_MAX_UINT64;
+	}
+
+	DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static __be32
 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
 		       struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
 	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_free_stateid,
 	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
 	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
 	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
 			get_parent_attributes(exp, &stat);
 		p = xdr_encode_hyper(p, stat.ino);
 	}
+#ifdef CONFIG_NFSD_PNFS
+	if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+	    (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+		if (exp->ex_layout_type) {
+			p = xdr_reserve_space(xdr, 8);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(1);
+			*p++ = cpu_to_be32(exp->ex_layout_type);
+		} else {
+			p = xdr_reserve_space(xdr, 4);
+			if (!p)
+				goto out_resource;
+			*p++ = cpu_to_be32(0);
+		}
+	}
+
+	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out_resource;
+		*p++ = cpu_to_be32(stat.blksize);
+	}
+#endif /* CONFIG_NFSD_PNFS */
 	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
 		status = nfsd4_encode_security_label(xdr, rqstp, context,
 								contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	if (entry_bytes > cd->rd_maxcount)
 		goto fail;
 	cd->rd_maxcount -= entry_bytes;
-	if (!cd->rd_dircount)
-		goto fail;
 	/*
 	 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
 	 * let's always let through the first entry, at least:
 	 */
-	name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+	if (!cd->rd_dircount)
+		goto fail;
+	name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
 	if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
 		goto fail;
 	cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+
 	cd->cookie_offset = cookie_offset;
 skip_entry:
 	cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfserr;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[gdev->gd_layout_type];
+	u32 starting_len = xdr->buf->len, needed_len;
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(gdev->gd_layout_type);
+
+	/* If maxcount is 0 then just update notifications */
+	if (gdev->gd_maxcount != 0) {
+		nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+		if (nfserr) {
+			/*
+			 * We don't bother to burden the layout drivers with
+			 * enforcing gd_maxcount, just tell the client to
+			 * come back with a bigger buffer if it's not enough.
+			 */
+			if (xdr->buf->len + 4 > gdev->gd_maxcount)
+				goto toosmall;
+			goto out;
+		}
+	}
+
+	nfserr = nfserr_resource;
+	if (gdev->gd_notify_types) {
+		p = xdr_reserve_space(xdr, 4 + 4);
+		if (!p)
+			goto out;
+		*p++ = cpu_to_be32(1);			/* bitmap length */
+		*p++ = cpu_to_be32(gdev->gd_notify_types);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out;
+		*p++ = 0;
+	}
+
+	nfserr = 0;
+out:
+	kfree(gdev->gd_device);
+	dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+	return nfserr;
+
+toosmall:
+	dprintk("%s: maxcount too small\n", __func__);
+	needed_len = xdr->buf->len + 4 /* notifications */;
+	xdr_truncate_encode(xdr, starting_len);
+	p = xdr_reserve_space(xdr, 4);
+	if (!p) {
+		nfserr = nfserr_resource;
+	} else {
+		*p++ = cpu_to_be32(needed_len);
+		nfserr = nfserr_toosmall;
+	}
+	goto out;
+}
+
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops =
+		nfsd4_layout_ops[lgp->lg_layout_type];
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, nfserr);
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(1);	/* we always set return-on-close */
+	*p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+	p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+				    sizeof(stateid_opaque_t));
+
+	*p++ = cpu_to_be32(1);	/* we always return a single layout */
+	p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+	p = xdr_encode_hyper(p, lgp->lg_seg.length);
+	*p++ = cpu_to_be32(lgp->lg_seg.iomode);
+	*p++ = cpu_to_be32(lgp->lg_layout_type);
+
+	nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+	kfree(lgp->lg_content);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_layoutcommit *lcp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lcp->lc_size_chg);
+	if (lcp->lc_size_chg) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		p = xdr_encode_hyper(p, lcp->lc_newsize);
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lrp->lrs_present);
+	if (lrp->lrs_present)
+		nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+	return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_layoutcommit,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_layoutget,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
 	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+#endif
 	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo_no_name,
 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
 #include "cache.h"
 #include "state.h"
 #include "netns.h"
+#include "pnfs.h"
 
 /*
  *	We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_slabs();
 	if (retval)
 		goto out_unregister_pernet;
-	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
+	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	if (retval)
+		goto out_exit_pnfs;
 	nfsd_stat_init();	/* Statistics */
 	retval = nfsd_reply_cache_init();
 	if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
 out_free_stat:
 	nfsd_stat_shutdown();
 	nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+	nfsd4_exit_pnfs();
 out_free_slabs:
 	nfsd4_free_slabs();
 out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
 	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
+	nfsd4_exit_pnfs();
 	nfsd_fault_inject_cleanup();
 	unregister_filesystem(&nfsd_fs_type);
 	unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void		nfsd_lockd_shutdown(void);
 
 #define NFSD4_SUPPORTED_ATTRS_WORD2 0
 
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1	FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE	| FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1	0
+#define PNFSD_SUPPORTED_ATTRS_WORD2	0
+#endif /* CONFIG_NFSD_PNFS */
+
 #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
 	NFSD4_SUPPORTED_ATTRS_WORD0
 
 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
-	NFSD4_SUPPORTED_ATTRS_WORD1
+	(NFSD4_SUPPORTED_ATTRS_WORD1	| PNFSD_SUPPORTED_ATTRS_WORD1)
 
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+	(NFSD4_SUPPORTED_ATTRS_WORD2	| PNFSD_SUPPORTED_ATTRS_WORD2 | \
+	 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
 
+/* 4.2 */
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #define NFSD4_2_SECURITY_ATTRS		FATTR4_WORD2_SECURITY_LABEL
 #else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..84cae2079d21 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
 	return fhp;
 }
 
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	if (fh1->fh_size != fh2->fh_size)
+		return false;
+	if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+		return false;
+	return true;
+}
+
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+		return false;
+	if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type) != 0))
+		return false;
+	return true;
+}
+
 #ifdef CONFIG_NFSD_V3
 /*
  * The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program		nfsd_program = {
 static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
 	[0] = 1,
 	[1] = 1,
+	[2] = 1,
 };
 
 int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..fedb4d620a81
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,81 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+struct nfsd4_deviceid_map {
+	struct list_head	hash;
+	u64			idx;
+	int			fsid_type;
+	u32			fsid[];
+};
+
+struct nfsd4_layout_ops {
+	u32		notify_types;
+
+	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+			struct nfsd4_getdeviceinfo *gdevp);
+	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+			struct nfsd4_getdeviceinfo *gdevp);
+
+	__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+			struct nfsd4_layoutget *lgp);
+	__be32 (*encode_layoutget)(struct xdr_stream *,
+			struct nfsd4_layoutget *lgp);
+
+	__be32 (*proc_layoutcommit)(struct inode *inode,
+			struct nfsd4_layoutcommit *lcp);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+extern const struct nfsd4_layout_ops bl_layout_ops;
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+		struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+		struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+	return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
 /* For a deleg stateid kept around only to process free_stateid's: */
 #define NFS4_REVOKED_DELEG_STID 16
 #define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
 	unsigned char sc_type;
 	stateid_t sc_stateid;
 	struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
 	struct list_head	cl_delegations;
 	struct list_head	cl_revoked;	/* unacknowledged, revoked 4.1 state */
 	struct list_head        cl_lru;         /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	cl_lo_states;	/* outstanding layout states */
+#endif
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
 	atomic_t		fi_access[2];
 	u32			fi_share_deny;
 	struct file		*fi_deleg_file;
-	atomic_t		fi_delegees;
+	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+	struct list_head	fi_lo_states;
+	atomic_t		fi_lo_recalls;
+#endif
 };
 
 /*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 	return container_of(s, struct nfs4_ol_stateid, st_stid);
 }
 
+struct nfs4_layout_stateid {
+	struct nfs4_stid		ls_stid;
+	struct list_head		ls_perclnt;
+	struct list_head		ls_perfile;
+	spinlock_t			ls_lock;
+	struct list_head		ls_layouts;
+	u32				ls_layout_type;
+	struct file			*ls_file;
+	struct nfsd4_callback		ls_recall;
+	stateid_t			ls_recall_sid;
+	bool				ls_recalled;
+};
+
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+	return container_of(s, struct nfs4_layout_stateid, ls_stid);
+}
+
 /* flags for preprocess_seqid_op() */
 #define RD_STATE	        0x00000010
 #define WR_STATE	        0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 enum nfsd4_cb_op {
 	NFSPROC4_CLNT_CB_NULL = 0,
 	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_LAYOUT,
 	NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
@@ -545,6 +572,12 @@ struct nfsd_net;
 extern __be32 nfs4_preprocess_stateid_op(struct net *net,
 		struct nfsd4_compound_state *cstate,
 		stateid_t *stateid, int flags, struct file **filp);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+		     stateid_t *stateid, unsigned char typemask,
+		     struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+		struct kmem_cache *slab);
+void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+	atomic_inc(&fi->fi_ref);
+}
+struct file *find_any_file(struct nfs4_file *f);
+
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
 
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
+
+#include "state.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(nfsd_stateid_class,
+	TP_PROTO(stateid_t *stp),
+	TP_ARGS(stp),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, si_id)
+		__field(u32, si_generation)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+		__entry->cl_id = stp->si_opaque.so_clid.cl_id;
+		__entry->si_id = stp->si_opaque.so_id;
+		__entry->si_generation = stp->si_generation;
+	),
+	TP_printk("client %08x:%08x stateid %08x:%08x",
+		__entry->cl_boot,
+		__entry->cl_id,
+		__entry->si_id,
+		__entry->si_generation)
+)
+
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, name, \
+	TP_PROTO(stateid_t *stp), \
+	TP_ARGS(stp))
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+
+#endif /* _NFSD_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
 	u32 rca_one_fs;
 };
 
+struct nfsd4_deviceid {
+	u64			fsid_idx;
+	u32			generation;
+	u32			pad;
+};
+
+struct nfsd4_layout_seg {
+	u32			iomode;
+	u64			offset;
+	u64			length;
+};
+
+struct nfsd4_getdeviceinfo {
+	struct nfsd4_deviceid	gd_devid;	/* request */
+	u32			gd_layout_type;	/* request */
+	u32			gd_maxcount;	/* request */
+	u32			gd_notify_types;/* request - response */
+	void			*gd_device;	/* response */
+};
+
+struct nfsd4_layoutget {
+	u64			lg_minlength;	/* request */
+	u32			lg_signal;	/* request */
+	u32			lg_layout_type;	/* request */
+	u32			lg_maxcount;	/* request */
+	stateid_t		lg_sid;		/* request/response */
+	struct nfsd4_layout_seg	lg_seg;		/* request/response */
+	void			*lg_content;	/* response */
+};
+
+struct nfsd4_layoutcommit {
+	stateid_t		lc_sid;		/* request */
+	struct nfsd4_layout_seg	lc_seg;		/* request */
+	u32			lc_reclaim;	/* request */
+	u32			lc_newoffset;	/* request */
+	u64			lc_last_wr;	/* request */
+	struct timespec		lc_mtime;	/* request */
+	u32			lc_layout_type;	/* request */
+	u32			lc_up_len;	/* layout length */
+	void			*lc_up_layout;	/* decoded by callback */
+	u32			lc_size_chg;	/* boolean for response */
+	u64			lc_newsize;	/* response */
+};
+
+struct nfsd4_layoutreturn {
+	u32			lr_return_type;	/* request */
+	u32			lr_layout_type;	/* request */
+	struct nfsd4_layout_seg	lr_seg;		/* request */
+	u32			lr_reclaim;	/* request */
+	u32			lrf_body_len;	/* request */
+	void			*lrf_body;	/* request */
+	stateid_t		lr_sid;		/* request/response */
+	u32			lrs_present;	/* response */
+};
+
 struct nfsd4_fallocate {
 	/* request */
 	stateid_t	falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+		struct nfsd4_getdeviceinfo	getdeviceinfo;
+		struct nfsd4_layoutget		layoutget;
+		struct nfsd4_layoutcommit	layoutcommit;
+		struct nfsd4_layoutreturn	layoutreturn;
 
 		/* NFSv4.2 */
 		struct nfsd4_fallocate		allocate;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
 #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+#define NFS4_enc_cb_layout_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + 3 +                         \
+					enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 41b223a59a63..fa05e04c5531 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 struct dentry;
+struct iattr;
 struct inode;
 struct super_block;
 struct vfsmount;
@@ -180,6 +181,21 @@ struct fid {
  *    get_name is not (which is possibly inconsistent)
  */
 
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
+#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
+
+struct iomap {
+	sector_t	blkno;	/* first sector of mapping */
+	loff_t		offset;	/* file offset of mapping, bytes */
+	u64		length;	/* length of mapping, bytes */
+	int		type;	/* type of mapping */
+};
+
 struct export_operations {
 	int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
 			struct inode *parent);
@@ -191,6 +207,13 @@ struct export_operations {
 			struct dentry *child);
 	struct dentry * (*get_parent)(struct dentry *child);
 	int (*commit_metadata)(struct inode *inode);
+
+	int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+	int (*map_blocks)(struct inode *inode, loff_t offset,
+			  u64 len, struct iomap *iomap,
+			  bool write, u32 *device_generation);
+	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
+			     int nr_iomaps, struct iattr *iattr);
 };
 
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f125b88443bd..cdcb1e9d9613 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -873,6 +873,7 @@ static inline struct file *get_file(struct file *f)
 #define FL_DOWNGRADE_PENDING	256 /* Lease is being downgraded */
 #define FL_UNLOCK_PENDING	512 /* Lease is being broken */
 #define FL_OFDLCK	1024	/* lock is "owned" by struct file */
+#define FL_LAYOUT	2048	/* outstanding pNFS layout */
 
 /*
  * Special return value from posix_lock_file() and vfs_lock_file() for
@@ -2035,6 +2036,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 	return ret;
 }
 
+static inline int break_layout(struct inode *inode, bool wait)
+{
+	smp_mb();
+	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+		return __break_lease(inode,
+				wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
+				FL_LAYOUT);
+	return 0;
+}
+
 #else /* !CONFIG_FILE_LOCKING */
 static inline int locks_mandatory_locked(struct file *file)
 {
@@ -2090,6 +2101,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 	return 0;
 }
 
+static inline int break_layout(struct inode *inode, bool wait)
+{
+	return 0;
+}
+
 #endif /* CONFIG_FILE_LOCKING */
 
 /* fs/open.c */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index de7c91ca427e..ed43cb74b11d 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -411,6 +411,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
 #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
+#define FATTR4_WORD2_LAYOUT_TYPES       (1UL << 0)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
@@ -517,6 +518,7 @@ enum pnfs_layouttype {
 	LAYOUT_OSD2_OBJECTS = 2,
 	LAYOUT_BLOCK_VOLUME = 3,
 	LAYOUT_FLEX_FILES = 4,
+	LAYOUT_TYPE_MAX
 };
 
 /* used for both layout return and recall */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6f22cfeef5e3..fae6fb947fc8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -110,7 +110,7 @@ struct svc_serv {
  * We use sv_nrthreads as a reference count.  svc_destroy() drops
  * this refcount, so we need to bump it up around operations that
  * change the number of threads.  Horrible, but there it is.
- * Should be called with the BKL held.
+ * Should be called with the "service mutex" held.
  */
 static inline void svc_get(struct svc_serv *serv)
 {
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ddfe88f52219..df8edf8ec914 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -77,6 +77,7 @@ struct svc_rdma_op_ctxt {
 	enum ib_wr_opcode wr_op;
 	enum ib_wc_status wc_status;
 	u32 byte_len;
+	u32 position;
 	struct svcxprt_rdma *xprt;
 	unsigned long flags;
 	enum dma_data_direction direction;
@@ -148,6 +149,10 @@ struct svcxprt_rdma {
 	struct ib_cq         *sc_rq_cq;
 	struct ib_cq         *sc_sq_cq;
 	struct ib_mr         *sc_phys_mr;	/* MR for server memory */
+	int		     (*sc_reader)(struct svcxprt_rdma *,
+					  struct svc_rqst *,
+					  struct svc_rdma_op_ctxt *,
+					  int *, u32 *, u32, u32, u64, bool);
 	u32		     sc_dev_caps;	/* distilled device caps */
 	u32		     sc_dma_lkey;	/* local dma key */
 	unsigned int	     sc_frmr_pg_list_len;
@@ -176,8 +181,6 @@ struct svcxprt_rdma {
 #define RPCRDMA_MAX_REQ_SIZE    4096
 
 /* svc_rdma_marshal.c */
-extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
-				      int *, int *);
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
 extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
@@ -195,6 +198,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
 
 /* svc_rdma_recvfrom.c */
 extern int svc_rdma_recvfrom(struct svc_rqst *);
+extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *,
+			       struct svc_rdma_op_ctxt *, int *, u32 *,
+			       u32, u32, u64, bool);
+extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
+				struct svc_rdma_op_ctxt *, int *, u32 *,
+				u32, u32, u64, bool);
 
 /* svc_rdma_sendto.c */
 extern int svc_rdma_sendto(struct svc_rqst *);
diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h
index 1fdc95bb2375..0bf130a1c58d 100644
--- a/include/uapi/linux/nfsd/debug.h
+++ b/include/uapi/linux/nfsd/debug.h
@@ -32,6 +32,7 @@
 #define NFSDDBG_REPCACHE	0x0080
 #define NFSDDBG_XDR		0x0100
 #define NFSDDBG_LOCKD		0x0200
+#define NFSDDBG_PNFS		0x0400
 #define NFSDDBG_ALL		0x7FFF
 #define NFSDDBG_NOCHANGE	0xFFFF
 
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 584b6ef3a5e8..4742f2cb42f2 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -47,8 +47,10 @@
  * exported filesystem.
  */
 #define	NFSEXP_V4ROOT		0x10000
+#define NFSEXP_NOPNFS		0x20000
+
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS		0x1FE7F
+#define NFSEXP_ALLFLAGS		0x3FE7F
 
 /* The flags that may vary depending on security flavor: */
 #define NFSEXP_SECINFO_FLAGS	(NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 91eaef1844c8..78974e4d9ad2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 EXPORT_SYMBOL_GPL(svc_set_num_threads);
 
 /*
- * Called from a server thread as it's exiting. Caller must hold the BKL or
- * the "service mutex", whichever is appropriate for the service.
+ * Called from a server thread as it's exiting. Caller must hold the "service
+ * mutex" for the service.
  */
 void
 svc_exit_thread(struct svc_rqst *rqstp)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c69358b3cf7f..163ac45c3639 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list);
  *	svc_pool->sp_lock protects most of the fields of that pool.
  *	svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
  *	when both need to be taken (rare), svc_serv->sv_lock is first.
- *	BKL protects svc_serv->sv_nrthread.
+ *	The "service mutex" protects svc_serv->sv_nrthread.
  *	svc_sock->sk_lock protects the svc_sock->sk_deferred list
  *             and the ->sk_info_authunix cache.
  *
@@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list);
  *		  that no other thread will be using the transport or will
  *		  try to set XPT_DEAD.
  */
-
 int svc_reg_xprt_class(struct svc_xprt_class *xcl)
 {
 	struct svc_xprt_class *cl;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 65b146297f5a..b681855cf970 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -71,22 +71,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
 }
 
 /*
- * Determine number of chunks and total bytes in chunk list. The chunk
- * list has already been verified to fit within the RPCRDMA header.
- */
-void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
-			       int *ch_count, int *byte_count)
-{
-	/* compute the number of bytes represented by read chunks */
-	*byte_count = 0;
-	*ch_count = 0;
-	for (; ch->rc_discrim != 0; ch++) {
-		*byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
-		*ch_count = *ch_count + 1;
-	}
-}
-
-/*
  * Decodes a write chunk list. The expected format is as follows:
  *    descrim  : xdr_one
  *    nchunks  : <count>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e0110270d650..f9f13a32ddb8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -43,7 +43,6 @@
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/rpc_rdma.h>
 #include <linux/spinlock.h>
-#include <linux/highmem.h>
 #include <asm/unaligned.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
@@ -60,6 +59,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 			       struct svc_rdma_op_ctxt *ctxt,
 			       u32 byte_count)
 {
+	struct rpcrdma_msg *rmsgp;
 	struct page *page;
 	u32 bc;
 	int sge_no;
@@ -82,7 +82,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	/* If data remains, store it in the pagelist */
 	rqstp->rq_arg.page_len = bc;
 	rqstp->rq_arg.page_base = 0;
-	rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
+	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+	if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+		rqstp->rq_arg.pages = &rqstp->rq_pages[0];
+	else
+		rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
 	sge_no = 1;
 	while (bc && sge_no < ctxt->count) {
 		page = ctxt->pages[sge_no];
@@ -95,14 +102,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	rqstp->rq_respages = &rqstp->rq_pages[sge_no];
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	/* We should never run out of SGE because the limit is defined to
-	 * support the max allowed RPC data length
-	 */
-	BUG_ON(bc && (sge_no == ctxt->count));
-	BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
-	       != byte_count);
-	BUG_ON(rqstp->rq_arg.len != byte_count);
-
 	/* If not all pages were used from the SGL, free the remaining ones */
 	bc = sge_no;
 	while (sge_no < ctxt->count) {
@@ -125,26 +124,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
 		return min_t(int, sge_count, xprt->sc_max_sge);
 }
 
-typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
-			      struct svc_rqst *rqstp,
-			      struct svc_rdma_op_ctxt *head,
-			      int *page_no,
-			      u32 *page_offset,
-			      u32 rs_handle,
-			      u32 rs_length,
-			      u64 rs_offset,
-			      int last);
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
-static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
-			       struct svc_rqst *rqstp,
-			       struct svc_rdma_op_ctxt *head,
-			       int *page_no,
-			       u32 *page_offset,
-			       u32 rs_handle,
-			       u32 rs_length,
-			       u64 rs_offset,
-			       int last)
+int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+			struct svc_rqst *rqstp,
+			struct svc_rdma_op_ctxt *head,
+			int *page_no,
+			u32 *page_offset,
+			u32 rs_handle,
+			u32 rs_length,
+			u64 rs_offset,
+			bool last)
 {
 	struct ib_send_wr read_wr;
 	int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
@@ -229,15 +218,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 }
 
 /* Issue an RDMA_READ using an FRMR to map the data sink */
-static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
-				struct svc_rqst *rqstp,
-				struct svc_rdma_op_ctxt *head,
-				int *page_no,
-				u32 *page_offset,
-				u32 rs_handle,
-				u32 rs_length,
-				u64 rs_offset,
-				int last)
+int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+			 struct svc_rqst *rqstp,
+			 struct svc_rdma_op_ctxt *head,
+			 int *page_no,
+			 u32 *page_offset,
+			 u32 rs_handle,
+			 u32 rs_length,
+			 u64 rs_offset,
+			 bool last)
 {
 	struct ib_send_wr read_wr;
 	struct ib_send_wr inv_wr;
@@ -365,24 +354,84 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
 	return ret;
 }
 
+static unsigned int
+rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch)
+{
+	unsigned int count;
+
+	for (count = 0; ch->rc_discrim != xdr_zero; ch++)
+		count++;
+	return count;
+}
+
+/* If there was additional inline content, append it to the end of arg.pages.
+ * Tail copy has to be done after the reader function has determined how many
+ * pages are needed for RDMA READ.
+ */
+static int
+rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
+	       u32 position, u32 byte_count, u32 page_offset, int page_no)
+{
+	char *srcp, *destp;
+	int ret;
+
+	ret = 0;
+	srcp = head->arg.head[0].iov_base + position;
+	byte_count = head->arg.head[0].iov_len - position;
+	if (byte_count > PAGE_SIZE) {
+		dprintk("svcrdma: large tail unsupported\n");
+		return 0;
+	}
+
+	/* Fit as much of the tail on the current page as possible */
+	if (page_offset != PAGE_SIZE) {
+		destp = page_address(rqstp->rq_arg.pages[page_no]);
+		destp += page_offset;
+		while (byte_count--) {
+			*destp++ = *srcp++;
+			page_offset++;
+			if (page_offset == PAGE_SIZE && byte_count)
+				goto more;
+		}
+		goto done;
+	}
+
+more:
+	/* Fit the rest on the next page */
+	page_no++;
+	destp = page_address(rqstp->rq_arg.pages[page_no]);
+	while (byte_count--)
+		*destp++ = *srcp++;
+
+	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+done:
+	byte_count = head->arg.head[0].iov_len - position;
+	head->arg.page_len += byte_count;
+	head->arg.len += byte_count;
+	head->arg.buflen += byte_count;
+	return 1;
+}
+
 static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 			    struct rpcrdma_msg *rmsgp,
 			    struct svc_rqst *rqstp,
 			    struct svc_rdma_op_ctxt *head)
 {
-	int page_no, ch_count, ret;
+	int page_no, ret;
 	struct rpcrdma_read_chunk *ch;
-	u32 page_offset, byte_count;
+	u32 handle, page_offset, byte_count;
+	u32 position;
 	u64 rs_offset;
-	rdma_reader_fn reader;
+	bool last;
 
 	/* If no read list is present, return 0 */
 	ch = svc_rdma_get_read_chunk(rmsgp);
 	if (!ch)
 		return 0;
 
-	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
-	if (ch_count > RPCSVC_MAXPAGES)
+	if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES)
 		return -EINVAL;
 
 	/* The request is completed when the RDMA_READs complete. The
@@ -391,34 +440,41 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	 */
 	head->arg.head[0] = rqstp->rq_arg.head[0];
 	head->arg.tail[0] = rqstp->rq_arg.tail[0];
-	head->arg.pages = &head->pages[head->count];
 	head->hdr_count = head->count;
 	head->arg.page_base = 0;
 	head->arg.page_len = 0;
 	head->arg.len = rqstp->rq_arg.len;
 	head->arg.buflen = rqstp->rq_arg.buflen;
 
-	/* Use FRMR if supported */
-	if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
-		reader = rdma_read_chunk_frmr;
-	else
-		reader = rdma_read_chunk_lcl;
+	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	position = be32_to_cpu(ch->rc_position);
+
+	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+	if (position == 0) {
+		head->arg.pages = &head->pages[0];
+		page_offset = head->byte_len;
+	} else {
+		head->arg.pages = &head->pages[head->count];
+		page_offset = 0;
+	}
 
-	page_no = 0; page_offset = 0;
-	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
-	     ch->rc_discrim != 0; ch++) {
+	ret = 0;
+	page_no = 0;
+	for (; ch->rc_discrim != xdr_zero; ch++) {
+		if (be32_to_cpu(ch->rc_position) != position)
+			goto err;
 
+		handle = be32_to_cpu(ch->rc_target.rs_handle),
+		byte_count = be32_to_cpu(ch->rc_target.rs_length);
 		xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
 				 &rs_offset);
-		byte_count = ntohl(ch->rc_target.rs_length);
 
 		while (byte_count > 0) {
-			ret = reader(xprt, rqstp, head,
-				     &page_no, &page_offset,
-				     ntohl(ch->rc_target.rs_handle),
-				     byte_count, rs_offset,
-				     ((ch+1)->rc_discrim == 0) /* last */
-				     );
+			last = (ch + 1)->rc_discrim == xdr_zero;
+			ret = xprt->sc_reader(xprt, rqstp, head,
+					      &page_no, &page_offset,
+					      handle, byte_count,
+					      rs_offset, last);
 			if (ret < 0)
 				goto err;
 			byte_count -= ret;
@@ -426,7 +482,24 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 			head->arg.buflen += ret;
 		}
 	}
+
+	/* Read list may need XDR round-up (see RFC 5666, s. 3.7) */
+	if (page_offset & 3) {
+		u32 pad = 4 - (page_offset & 3);
+
+		head->arg.page_len += pad;
+		head->arg.len += pad;
+		head->arg.buflen += pad;
+		page_offset += pad;
+	}
+
 	ret = 1;
+	if (position && position < head->arg.head[0].iov_len)
+		ret = rdma_copy_tail(rqstp, head, position,
+				     byte_count, page_offset, page_no);
+	head->arg.head[0].iov_len = position;
+	head->position = position;
+
  err:
 	/* Detach arg pages. svc_recv will replenish them */
 	for (page_no = 0;
@@ -436,47 +509,33 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	return ret;
 }
 
-/*
- * To avoid a separate RDMA READ just for a handful of zero bytes,
- * RFC 5666 section 3.7 allows the client to omit the XDR zero pad
- * in chunk lists.
- */
-static void
-rdma_fix_xdr_pad(struct xdr_buf *buf)
-{
-	unsigned int page_len = buf->page_len;
-	unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
-	unsigned int offset, pg_no;
-	char *p;
-
-	if (size == 0)
-		return;
-
-	pg_no = page_len >> PAGE_SHIFT;
-	offset = page_len & ~PAGE_MASK;
-	p = page_address(buf->pages[pg_no]);
-	memset(p + offset, 0, size);
-
-	buf->page_len += size;
-	buf->buflen += size;
-	buf->len += size;
-}
-
 static int rdma_read_complete(struct svc_rqst *rqstp,
 			      struct svc_rdma_op_ctxt *head)
 {
 	int page_no;
 	int ret;
 
-	BUG_ON(!head);
-
 	/* Copy RPC pages */
 	for (page_no = 0; page_no < head->count; page_no++) {
 		put_page(rqstp->rq_pages[page_no]);
 		rqstp->rq_pages[page_no] = head->pages[page_no];
 	}
+
+	/* Adjustments made for RDMA_NOMSG type requests */
+	if (head->position == 0) {
+		if (head->arg.len <= head->sge[0].length) {
+			head->arg.head[0].iov_len = head->arg.len -
+							head->byte_len;
+			head->arg.page_len = 0;
+		} else {
+			head->arg.head[0].iov_len = head->sge[0].length -
+								head->byte_len;
+			head->arg.page_len = head->arg.len -
+						head->sge[0].length;
+		}
+	}
+
 	/* Point rq_arg.pages past header */
-	rdma_fix_xdr_pad(&head->arg);
 	rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
 	rqstp->rq_arg.page_len = head->arg.page_len;
 	rqstp->rq_arg.page_base = head->arg.page_base;
@@ -501,8 +560,8 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
 		+ rqstp->rq_arg.tail[0].iov_len;
-	dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
-		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+	dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
 		ret, rqstp->rq_arg.len,	rqstp->rq_arg.head[0].iov_base,
 		rqstp->rq_arg.head[0].iov_len);
 
@@ -558,7 +617,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	}
 	dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
 		ctxt, rdma_xprt, rqstp, ctxt->wc_status);
-	BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
 	atomic_inc(&rdma_stat_recv);
 
 	/* Build up the XDR from the receive buffers. */
@@ -591,8 +649,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		+ rqstp->rq_arg.tail[0].iov_len;
 	svc_rdma_put_context(ctxt, 0);
  out:
-	dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
-		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+	dprintk("svcrdma: ret=%d, rq_arg.len=%u, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n",
 		ret, rqstp->rq_arg.len,
 		rqstp->rq_arg.head[0].iov_base,
 		rqstp->rq_arg.head[0].iov_len);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 9f1b50689c0f..7de33d1af9b6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -60,8 +60,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 	u32 page_off;
 	int page_no;
 
-	BUG_ON(xdr->len !=
-	       (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+	if (xdr->len !=
+	    (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
+		pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+		return -EIO;
+	}
 
 	/* Skip the first sge, this is for the RPCRDMA header */
 	sge_no = 1;
@@ -150,7 +153,11 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 	int bc;
 	struct svc_rdma_op_ctxt *ctxt;
 
-	BUG_ON(vec->count > RPCSVC_MAXPAGES);
+	if (vec->count > RPCSVC_MAXPAGES) {
+		pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+		return -EIO;
+	}
+
 	dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
 		"write_len=%d, vec->sge=%p, vec->count=%lu\n",
 		rmr, (unsigned long long)to, xdr_off,
@@ -190,7 +197,10 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 		sge_off = 0;
 		sge_no++;
 		xdr_sge_no++;
-		BUG_ON(xdr_sge_no > vec->count);
+		if (xdr_sge_no > vec->count) {
+			pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
+			goto err;
+		}
 		bc -= sge_bytes;
 		if (sge_no == xprt->sc_max_sge)
 			break;
@@ -421,7 +431,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
 		ctxt->sge[sge_no].length = sge_bytes;
 	}
-	BUG_ON(byte_count != 0);
+	if (byte_count != 0) {
+		pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+		goto err;
+	}
 
 	/* Save all respages in the ctxt and remove them from the
 	 * respages array. They are our pages until the I/O
@@ -442,7 +455,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	}
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	BUG_ON(sge_no > rdma->sc_max_sge);
+	if (sge_no > rdma->sc_max_sge) {
+		pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+		goto err;
+	}
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
 	send_wr.wr_id = (unsigned long)ctxt;
@@ -467,18 +483,6 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
 {
 }
 
-/*
- * Return the start of an xdr buffer.
- */
-static void *xdr_start(struct xdr_buf *xdr)
-{
-	return xdr->head[0].iov_base -
-		(xdr->len -
-		 xdr->page_len -
-		 xdr->tail[0].iov_len -
-		 xdr->head[0].iov_len);
-}
-
 int svc_rdma_sendto(struct svc_rqst *rqstp)
 {
 	struct svc_xprt *xprt = rqstp->rq_xprt;
@@ -496,8 +500,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
-	/* Get the RDMA request header. */
-	rdma_argp = xdr_start(&rqstp->rq_arg);
+	/* Get the RDMA request header. The receive logic always
+	 * places this at the start of page 0.
+	 */
+	rdma_argp = page_address(rqstp->rq_pages[0]);
 
 	/* Build an req vec for the XDR */
 	ctxt = svc_rdma_get_context(rdma);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4e618808bc98..f609c1c2d38d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -139,7 +139,6 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 	struct svcxprt_rdma *xprt;
 	int i;
 
-	BUG_ON(!ctxt);
 	xprt = ctxt->xprt;
 	if (free_pages)
 		for (i = 0; i < ctxt->count; i++)
@@ -339,12 +338,14 @@ static void process_context(struct svcxprt_rdma *xprt,
 
 	switch (ctxt->wr_op) {
 	case IB_WR_SEND:
-		BUG_ON(ctxt->frmr);
+		if (ctxt->frmr)
+			pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
 		svc_rdma_put_context(ctxt, 1);
 		break;
 
 	case IB_WR_RDMA_WRITE:
-		BUG_ON(ctxt->frmr);
+		if (ctxt->frmr)
+			pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
 		svc_rdma_put_context(ctxt, 0);
 		break;
 
@@ -353,19 +354,21 @@ static void process_context(struct svcxprt_rdma *xprt,
 		svc_rdma_put_frmr(xprt, ctxt->frmr);
 		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
 			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
-			BUG_ON(!read_hdr);
-			spin_lock_bh(&xprt->sc_rq_dto_lock);
-			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
-			list_add_tail(&read_hdr->dto_q,
-				      &xprt->sc_read_complete_q);
-			spin_unlock_bh(&xprt->sc_rq_dto_lock);
+			if (read_hdr) {
+				spin_lock_bh(&xprt->sc_rq_dto_lock);
+				set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+				list_add_tail(&read_hdr->dto_q,
+					      &xprt->sc_read_complete_q);
+				spin_unlock_bh(&xprt->sc_rq_dto_lock);
+			} else {
+				pr_err("svcrdma: ctxt->read_hdr == NULL\n");
+			}
 			svc_xprt_enqueue(&xprt->sc_xprt);
 		}
 		svc_rdma_put_context(ctxt, 0);
 		break;
 
 	default:
-		BUG_ON(1);
 		printk(KERN_ERR "svcrdma: unexpected completion type, "
 		       "opcode=%d\n",
 		       ctxt->wr_op);
@@ -513,7 +516,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 	buflen = 0;
 	ctxt->direction = DMA_FROM_DEVICE;
 	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
-		BUG_ON(sge_no >= xprt->sc_max_sge);
+		if (sge_no >= xprt->sc_max_sge) {
+			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+			goto err_put_ctxt;
+		}
 		page = svc_rdma_get_page();
 		ctxt->pages[sge_no] = page;
 		pa = ib_dma_map_page(xprt->sc_cm_id->device,
@@ -687,7 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 {
 	struct rdma_cm_id *listen_id;
 	struct svcxprt_rdma *cma_xprt;
-	struct svc_xprt *xprt;
 	int ret;
 
 	dprintk("svcrdma: Creating RDMA socket\n");
@@ -698,7 +703,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	cma_xprt = rdma_create_xprt(serv, 1);
 	if (!cma_xprt)
 		return ERR_PTR(-ENOMEM);
-	xprt = &cma_xprt->sc_xprt;
 
 	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
 				   IB_QPT_RC);
@@ -822,7 +826,7 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
 	if (frmr) {
 		frmr_unmap_dma(rdma, frmr);
 		spin_lock_bh(&rdma->sc_frmr_q_lock);
-		BUG_ON(!list_empty(&frmr->frmr_list));
+		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
 		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
 		spin_unlock_bh(&rdma->sc_frmr_q_lock);
 	}
@@ -970,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	 * NB:	iWARP requires remote write access for the data sink
 	 *	of an RDMA_READ. IB does not.
 	 */
+	newxprt->sc_reader = rdma_read_chunk_lcl;
 	if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
 		newxprt->sc_frmr_pg_list_len =
 			devattr.max_fast_reg_page_list_len;
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+		newxprt->sc_reader = rdma_read_chunk_frmr;
 	}
 
 	/*
@@ -1125,7 +1131,9 @@ static void __svc_rdma_free(struct work_struct *work)
 	dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
 
 	/* We should only be called from kref_put */
-	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+	if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
+		       atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
 
 	/*
 	 * Destroy queued, but not processed read completions. Note
@@ -1153,8 +1161,12 @@ static void __svc_rdma_free(struct work_struct *work)
 	}
 
 	/* Warn if we leaked a resource or under-referenced */
-	WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
-	WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+	if (atomic_read(&rdma->sc_ctxt_used) != 0)
+		pr_err("svcrdma: ctxt still in use? (%d)\n",
+		       atomic_read(&rdma->sc_ctxt_used));
+	if (atomic_read(&rdma->sc_dma_used) != 0)
+		pr_err("svcrdma: dma still in use? (%d)\n",
+		       atomic_read(&rdma->sc_dma_used));
 
 	/* De-allocate fastreg mr */
 	rdma_dealloc_frmr_q(rdma);
@@ -1254,7 +1266,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
 		return -ENOTCONN;
 
-	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
 	wr_count = 1;
 	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
 		wr_count++;