summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt23
-rw-r--r--Documentation/filesystems/nfs/pnfs-block-server.txt37
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/lockd/xdr.c8
-rw-r--r--fs/locks.c26
-rw-r--r--fs/nfsd/Kconfig10
-rw-r--r--fs/nfsd/Makefile8
-rw-r--r--fs/nfsd/blocklayout.c189
-rw-r--r--fs/nfsd/blocklayoutxdr.c157
-rw-r--r--fs/nfsd/blocklayoutxdr.h62
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/nfs4callback.c99
-rw-r--r--fs/nfsd/nfs4layouts.c721
-rw-r--r--fs/nfsd/nfs4proc.c310
-rw-r--r--fs/nfsd/nfs4state.c76
-rw-r--r--fs/nfsd/nfs4xdr.c362
-rw-r--r--fs/nfsd/nfsctl.c9
-rw-r--r--fs/nfsd/nfsd.h16
-rw-r--r--fs/nfsd/nfsfh.h18
-rw-r--r--fs/nfsd/nfssvc.c1
-rw-r--r--fs/nfsd/pnfs.h81
-rw-r--r--fs/nfsd/state.h43
-rw-r--r--fs/nfsd/trace.c5
-rw-r--r--fs/nfsd/trace.h54
-rw-r--r--fs/nfsd/xdr4.h59
-rw-r--r--fs/nfsd/xdr4cb.h7
-rw-r--r--include/linux/exportfs.h23
-rw-r--r--include/linux/fs.h16
-rw-r--r--include/linux/nfs4.h2
-rw-r--r--include/linux/sunrpc/svc.h2
-rw-r--r--include/linux/sunrpc/svc_rdma.h13
-rw-r--r--include/uapi/linux/nfsd/debug.h1
-rw-r--r--include/uapi/linux/nfsd/export.h4
-rw-r--r--net/sunrpc/svc.c4
-rw-r--r--net/sunrpc/svc_xprt.c3
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c16
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c244
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c46
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c47
40 files changed, 2562 insertions, 254 deletions
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index c49cd7e796e7..682a59fabe3f 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -24,11 +24,6 @@ focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
"exactly once" semantics and better control and throttling of the
resources allocated for each client.
-Other NFSv4.1 features, Parallel NFS operations in particular,
-are still under development out of tree.
-See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
-for more information.
-
The table below, taken from the NFSv4.1 document, lists
the operations that are mandatory to implement (REQ), optional
(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -43,9 +38,7 @@ The OPTIONAL features identified and their abbreviations are as follows:
The following abbreviations indicate the linux server implementation status.
I Implemented NFSv4.1 operations.
NS Not Supported.
- NS* unimplemented optional feature.
- P pNFS features implemented out of tree.
- PNS pNFS features that are not supported yet (out of tree).
+ NS* Unimplemented optional feature.
Operations
@@ -70,13 +63,13 @@ I | DESTROY_SESSION | REQ | | Section 18.37 |
I | EXCHANGE_ID | REQ | | Section 18.35 |
I | FREE_STATEID | REQ | | Section 18.38 |
| GETATTR | REQ | | Section 18.7 |
-P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
-P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
+I | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
+NS*| GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
| GETFH | REQ | | Section 18.8 |
NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
-P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
-P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
-P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
+I | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
+I | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
+I | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
| LINK | OPT | | Section 18.9 |
| LOCK | REQ | | Section 18.10 |
| LOCKT | REQ | | Section 18.11 |
@@ -122,9 +115,9 @@ Callback Operations
| | MNI | or OPT) | |
+-------------------------+-----------+-------------+---------------+
| CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
-P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
+I | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
-P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
+NS*| CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
| CB_RECALL | OPT | FDELG, | Section 20.2 |
diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt
new file mode 100644
index 000000000000..2143673cf154
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs-block-server.txt
@@ -0,0 +1,37 @@
+pNFS block layout server user guide
+
+The Linux NFS server now supports the pNFS block layout extension. In this
+case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
+to handling all the metadata access to the NFS export also hands out layouts
+to the clients to directly access the underlying block devices that are
+shared with the client.
+
+To use pNFS block layouts with the Linux NFS server the exported file
+system needs to support the pNFS block layouts (currently just XFS), and the
+file system must sit on shared storage (typically iSCSI) that is accessible
+to the clients in addition to the MDS. As of now the file system needs to
+sit directly on the exported volume, striping or concatenation of
+volumes on the MDS and clients is not supported yet.
+
+On the server, pNFS block volume support is enabled automatically if the file
+system supports it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK
+option enabled, the blkmapd daemon from nfs-utils is running, and the
+file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client it calls
+/sbin/nfsd-recall-failed with the first argument set to the IP address of
+the client, and the second argument set to the device node without the /dev
+prefix for the file system to be fenced. Below is an example file that shows
+how to translate the device into a serial number from SCSI EVPD 0x80:
+
+cat > /sbin/nfsd-recall-failed << 'EOF'
+#!/bin/sh
+
+CLIENT="$1"
+DEV="/dev/$2"
+EVPD=`sg_inq --page=0x80 ${DEV} | \
+ grep "Unit serial number:" | \
+ awk -F ': ' '{print $2}'`
+
+echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
+EOF
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
{
/*
- * We can get away with a static buffer because we're only
- * called with BKL held.
+ * We can get away with a static buffer because this is only called
+ * from lockd, which is single-threaded.
*/
static char buf[2*NLM_MAXCOOKIELEN+1];
unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
return p + XDR_QUADLEN(NFS2_FHSIZE);
}
-static inline __be32 *
-nlm_encode_fh(__be32 *p, struct nfs_fh *f)
-{
- *p++ = htonl(NFS2_FHSIZE);
- memcpy(p, f->data, NFS2_FHSIZE);
- return p + XDR_QUADLEN(NFS2_FHSIZE);
-}
-
/*
* Encode and decode owner handle
*/
diff --git a/fs/locks.c b/fs/locks.c
index 4d0d41163a50..4753218f308e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
static bool lease_breaking(struct file_lock *fl)
@@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
{
+ if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+ return false;
if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
return false;
return locks_conflict(breaker, lease);
@@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp)
* conflict with the lease we're trying to set.
*/
static int
-check_conflicting_open(const struct dentry *dentry, const long arg)
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
{
int ret = 0;
struct inode *inode = dentry->d_inode;
+ if (flags & FL_LAYOUT)
+ return 0;
+
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
return -EAGAIN;
@@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- error = check_conflicting_open(dentry, arg);
+ error = check_conflicting_open(dentry, arg, lease->fl_flags);
if (error)
goto out;
@@ -1661,7 +1666,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
*/
error = -EAGAIN;
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp) {
+ if (fl->fl_file == filp &&
+ fl->fl_owner == lease->fl_owner) {
my_fl = fl;
continue;
}
@@ -1702,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
* precedes these checks.
*/
smp_mb();
- error = check_conflicting_open(dentry, arg);
+ error = check_conflicting_open(dentry, arg, lease->fl_flags);
if (error) {
locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
goto out;
@@ -1721,7 +1727,7 @@ out:
return error;
}
-static int generic_delete_lease(struct file *filp)
+static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
@@ -1737,7 +1743,8 @@ static int generic_delete_lease(struct file *filp)
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp) {
+ if (fl->fl_file == filp &&
+ fl->fl_owner == owner) {
victim = fl;
break;
}
@@ -1778,13 +1785,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
switch (arg) {
case F_UNLCK:
- return generic_delete_lease(filp);
+ return generic_delete_lease(filp, *priv);
case F_RDLCK:
case F_WRLCK:
if (!(*flp)->fl_lmops->lm_break) {
WARN_ON_ONCE(1);
return -ENOLCK;
}
+
return generic_add_lease(filp, arg, flp, priv);
default:
return -EINVAL;
@@ -1857,7 +1865,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
if (arg == F_UNLCK)
- return vfs_setlease(filp, F_UNLCK, NULL, NULL);
+ return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
return do_fcntl_add_lease(fd, filp, arg);
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
If unsure, say N.
+config NFSD_PNFS
+ bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+ depends on NFSD_V4
+ help
+ This option enables support for the parallel NFS features of the
+ minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+ server.
+
+ If unsure, say N.
+
config NFSD_V4_SECURITY_LABEL
bool "Provide Security Label support for NFSv4 server"
depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
# Makefile for the Linux nfs server
#
+ccflags-y += -I$(src) # needed for trace events
+
obj-$(CONFIG_NFSD) += nfsd.o
-nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y += trace.o
+
+nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+/*
+ * Build the GETDEVICEINFO payload for an export that is described by a
+ * single "simple" volume.
+ *
+ * Allocates a zeroed pnfs_block_deviceaddr with room for exactly one
+ * pnfs_block_volume, attaches it to gdp->gd_device, and asks the
+ * filesystem's ->get_uuid export operation to fill in the volume signature,
+ * its length and its offset.
+ *
+ * Returns -ENOMEM if the allocation fails, otherwise the return value of
+ * ->get_uuid.  The buffer stays attached to gdp->gd_device even when
+ * ->get_uuid fails; presumably the caller releases it -- TODO confirm.
+ */
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev;
+ struct pnfs_block_volume *b;
+
+ dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+ sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ gdp->gd_device = dev;
+
+ dev->nr_volumes = 1;
+ b = &dev->volumes[0];
+
+ b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+ /* Start with the capacity of sig[]; ->get_uuid gets a pointer to it. */
+ b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+ return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+ &b->simple.offset);
+}
+
+/*
+ * GETDEVICEINFO handler for the block layout type.
+ *
+ * Returns nfserr_inval when the superblock's block device is not the same
+ * as its bd_contains device (presumably a partition rather than a whole
+ * disk -- TODO confirm); otherwise builds the single simple volume
+ * description and converts its errno-style result with nfserrno().
+ *
+ * NOTE(review): sb->s_bdev is dereferenced without a NULL check; assumes
+ * only block-device backed filesystems reach this point -- confirm.
+ */
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ if (sb->s_bdev != sb->s_bdev->bd_contains)
+ return nfserr_inval;
+ return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+/*
+ * LAYOUTGET handler for the block layout type.
+ *
+ * Maps the requested byte range of @inode through the filesystem's
+ * ->map_blocks export operation and describes the result as a single
+ * pnfs_block_extent, which is stored in args->lg_content.  On success the
+ * layout segment in args->lg_seg is narrowed to the extent that was
+ * actually mapped.
+ *
+ * Returns 0 on success, nfserr_layoutunavailable when the request cannot be
+ * served with a block layout (too small, misaligned, -ENXIO from the
+ * filesystem, mapped extent shorter than lg_minlength, or an extent type
+ * that does not fit the requested iomode), and nfserrno() of the underlying
+ * error otherwise.  Every failure path sets seg->length to 0.
+ *
+ * The extent allocated here is left in args->lg_content on failure as well;
+ * presumably the caller frees it -- TODO confirm.
+ */
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ struct super_block *sb = inode->i_sb;
+ u32 block_size = (1 << inode->i_blkbits);
+ struct pnfs_block_extent *bex;
+ struct iomap iomap;
+ u32 device_generation = 0;
+ int error;
+
+ /*
+ * We do not attempt to support I/O smaller than the fs block size,
+ * or not aligned to it.
+ */
+ if (args->lg_minlength < block_size) {
+ dprintk("pnfsd: I/O too small\n");
+ goto out_layoutunavailable;
+ }
+ if (seg->offset & (block_size - 1)) {
+ dprintk("pnfsd: I/O misaligned\n");
+ goto out_layoutunavailable;
+ }
+
+ /*
+ * Some clients barf on non-zero block numbers for NONE or INVALID
+ * layouts, so make sure to zero the whole structure.
+ */
+ error = -ENOMEM;
+ bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+ if (!bex)
+ goto out_error;
+ args->lg_content = bex;
+
+ /* The fifth argument is true for every iomode other than IOMODE_READ. */
+ error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+ &iomap, seg->iomode != IOMODE_READ,
+ &device_generation);
+ if (error) {
+ if (error == -ENXIO)
+ goto out_layoutunavailable;
+ goto out_error;
+ }
+
+ if (iomap.length < args->lg_minlength) {
+ dprintk("pnfsd: extent smaller than minlength\n");
+ goto out_layoutunavailable;
+ }
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ if (seg->iomode == IOMODE_READ)
+ bex->es = PNFS_BLOCK_READ_DATA;
+ else
+ bex->es = PNFS_BLOCK_READWRITE_DATA;
+ /* presumably converts 512-byte sectors to bytes -- TODO confirm */
+ bex->soff = (iomap.blkno << 9);
+ break;
+ case IOMAP_UNWRITTEN:
+ if (seg->iomode & IOMODE_RW) {
+ /*
+ * Crack monkey special case from section 2.3.1.
+ *
+ * NOTE(review): lg_minlength == 0 can not be seen here,
+ * it already failed the lg_minlength < block_size check
+ * at the top of the function.
+ */
+ if (args->lg_minlength == 0) {
+ dprintk("pnfsd: no soup for you!\n");
+ goto out_layoutunavailable;
+ }
+
+ bex->es = PNFS_BLOCK_INVALID_DATA;
+ bex->soff = (iomap.blkno << 9);
+ break;
+ }
+ /*FALLTHRU*/
+ case IOMAP_HOLE:
+ if (seg->iomode == IOMODE_READ) {
+ bex->es = PNFS_BLOCK_NONE_DATA;
+ break;
+ }
+ /*FALLTHRU*/
+ case IOMAP_DELALLOC:
+ default:
+ WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+ goto out_layoutunavailable;
+ }
+
+ error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+ if (error)
+ goto out_error;
+ bex->foff = iomap.offset;
+ bex->len = iomap.length;
+
+ /* Report the range that was really mapped back through the segment. */
+ seg->offset = iomap.offset;
+ seg->length = iomap.length;
+
+ dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+ return 0;
+
+out_error:
+ seg->length = 0;
+ return nfserrno(error);
+out_layoutunavailable:
+ seg->length = 0;
+ return nfserr_layoutunavailable;
+}
+
+/*
+ * LAYOUTCOMMIT handler for the block layout type.
+ *
+ * Decodes the client's layoutupdate blob into an array of iomaps, prepares
+ * the timestamp and (optionally) size updates in an iattr, and passes both
+ * to the filesystem's ->commit_blocks export operation.
+ *
+ * Returns nfserrno() of the decode error if the blob is malformed,
+ * otherwise nfserrno() of the ->commit_blocks result.  The iomap array is
+ * freed here once ->commit_blocks has returned.
+ */
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp)
+{
+ /* lc_last_wr + 1 is used as the candidate new file size. */
+ loff_t new_size = lcp->lc_last_wr + 1;
+ struct iattr iattr = { .ia_valid = 0 };
+ struct iomap *iomaps;
+ int nr_iomaps;
+ int error;
+
+ nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+ if (nr_iomaps < 0)
+ return nfserrno(nr_iomaps);
+
+ /*
+ * Fall back to the current filesystem time when the client asked for
+ * "now" or supplied an mtime older than the one the inode already has.
+ */
+ if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+ timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+ lcp->lc_mtime = current_fs_time(inode->i_sb);
+ iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
+ iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
+
+ /* The size is only ever extended, never reduced. */
+ if (new_size > i_size_read(inode)) {
+ iattr.ia_valid |= ATTR_SIZE;
+ iattr.ia_size = new_size;
+ }
+
+ error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+ nr_iomaps, &iattr);
+ kfree(iomaps);
+ return nfserrno(error);
+}
+
+/*
+ * Operations table for the block layout type: the processing and XDR
+ * encoding callbacks for GETDEVICEINFO and LAYOUTGET, plus the processing
+ * callback for LAYOUTCOMMIT.
+ */
+const struct nfsd4_layout_ops bl_layout_ops = {
+ .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_block_proc_layoutget,
+ .encode_layoutget = nfsd4_block_encode_layoutget,
+ .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
+};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+/*
+ * Encode the block layout body of a LAYOUTGET reply.
+ *
+ * Emits a 32-bit byte length, an extent count of 1, and then the single
+ * extent stored in lgp->lg_content: device id, file offset, length,
+ * storage offset and extent state.
+ *
+ * Returns 0 on success or nfserr_toosmall if @xdr has no room left.
+ */
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct pnfs_block_extent *b = lgp->lg_content;
+ /*
+ * Extent count, five 64-bit words, extent state.  The five words are
+ * the three offsets/lengths plus the device id, which assumes
+ * struct nfsd4_deviceid is 16 bytes -- TODO confirm.
+ */
+ int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+ __be32 *p;
+
+ /* One extra word for the length prefix itself. */
+ p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+ if (!p)
+ return nfserr_toosmall;
+
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(1); /* we always return a single extent */
+
+ p = xdr_encode_opaque_fixed(p, &b->vol_id,
+ sizeof(struct nfsd4_deviceid));
+ p = xdr_encode_hyper(p, b->foff);
+ p = xdr_encode_hyper(p, b->len);
+ p = xdr_encode_hyper(p, b->soff);
+ *p++ = cpu_to_be32(b->es);
+ return 0;
+}
+
+/*
+ * Encode one volume description of a block layout device address.
+ *
+ * Only PNFS_BLOCK_VOLUME_SIMPLE is handled.  It is encoded as the volume
+ * type, a signature component count of 1, the 64-bit signature offset and
+ * the signature itself as a variable-length opaque.
+ *
+ * Returns the number of bytes added to @xdr, -ETOOSMALL if the stream has
+ * no room left, or -ENOTSUPP for any other volume type.
+ */
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int len;
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		/*
+		 * type + count + offset + opaque length word, followed by
+		 * the signature.  xdr_encode_opaque() pads the data up to a
+		 * multiple of four bytes, so the padding has to be counted
+		 * too; otherwise the total length the caller writes in front
+		 * of the device address is short whenever sig_len is not a
+		 * multiple of four.
+		 */
+		len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
+		p = xdr_reserve_space(xdr, len);
+		if (!p)
+			return -ETOOSMALL;
+
+		*p++ = cpu_to_be32(b->type);
+		*p++ = cpu_to_be32(1);	/* single signature */
+		p = xdr_encode_hyper(p, b->simple.offset);
+		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	return len;
+}
+
+/*
+ * Encode the block layout device address of a GETDEVICEINFO reply.
+ *
+ * Reserves two words up front (total byte length and number of volumes),
+ * encodes every volume of gdp->gd_device behind them, and then back-fills
+ * the two reserved words once the total length is known.
+ *
+ * Returns 0 on success, nfserr_resource if the two header words can not be
+ * reserved, or nfserrno() of the error from encoding a volume.
+ */
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev = gdp->gd_device;
+ /* len starts out covering the volume count word. */
+ int len = sizeof(__be32), ret, i;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, len + sizeof(__be32));
+ if (!p)
+ return nfserr_resource;
+
+ for (i = 0; i < dev->nr_volumes; i++) {
+ ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
+ if (ret < 0)
+ return nfserrno(ret);
+ len += ret;
+ }
+
+ /*
+ * Fill in the overall length and number of volumes at the beginning
+ * of the layout.
+ */
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(dev->nr_volumes);
+ return 0;
+}
+
+/*
+ * Decode the layoutupdate blob sent with a block layout LAYOUTCOMMIT.
+ *
+ * @p:          start of the blob: a 32-bit extent count followed by that
+ *              many extents of NFS4_BLOCK_EXTENT_SIZE bytes each
+ * @len:        length of the blob in bytes
+ * @iomapp:     on success, set to a kcalloc()ed array with one iomap per
+ *              extent (only offset and length are filled in); the caller
+ *              owns the array and must kfree() it
+ * @block_size: every file offset, length and storage offset has to be a
+ *              multiple of this; the bit-mask test used for that assumes a
+ *              power of two
+ *
+ * Only extents in state PNFS_BLOCK_READWRITE_DATA are accepted.
+ *
+ * Returns the number of extents decoded, -EINVAL if the blob is malformed,
+ * or -ENOMEM if the array can not be allocated.  *iomapp is left untouched
+ * on failure.
+ */
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, expected, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+
+	/*
+	 * The count comes straight from the client.  Bound it by what fits
+	 * into the buffer before multiplying, so that the 32-bit product
+	 * below can not wrap around and match @len by accident.  This also
+	 * keeps the count small enough to be returned as a positive int.
+	 */
+	if (nr_iomaps > (len - sizeof(__be32)) / NFS4_BLOCK_EXTENT_SIZE) {
+		dprintk("%s: extent count %u too large for %u bytes\n",
+			__func__, nr_iomaps, len);
+		return -EINVAL;
+	}
+
+	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		struct pnfs_block_extent bex;
+
+		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+
+		p = xdr_decode_hyper(p, &bex.foff);
+		if (bex.foff & (block_size - 1)) {
+			dprintk("%s: unaligned offset %lld\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.len);
+		if (bex.len & (block_size - 1)) {
+			/* report the offending length, not the offset */
+			dprintk("%s: unaligned length %lld\n",
+				__func__, bex.len);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.soff);
+		if (bex.soff & (block_size - 1)) {
+			dprintk("%s: unaligned disk offset %lld\n",
+				__func__, bex.soff);
+			goto fail;
+		}
+		bex.es = be32_to_cpup(p++);
+		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+			dprintk("%s: incorrect extent state %d\n",
+				__func__, bex.es);
+			goto fail;
+		}
+
+		iomaps[i].offset = bex.foff;
+		iomaps[i].length = bex.len;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+
+#include <linux/blkdev.h>
+#include "xdr4.h"
+
+struct iomap;
+struct xdr_stream;
+
+/*
+ * State of a block layout extent.  The numeric value is what is put on the
+ * wire when an extent is encoded.
+ */
+enum pnfs_block_extent_state {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2,
+ PNFS_BLOCK_NONE_DATA = 3,
+};
+
+/*
+ * In-memory form of one block layout extent: the device it lives on, the
+ * offset in the file, the length, the offset on the storage device and the
+ * extent state.
+ */
+struct pnfs_block_extent {
+ struct nfsd4_deviceid vol_id;
+ u64 foff;
+ u64 len;
+ u64 soff;
+ enum pnfs_block_extent_state es;
+};
+#define NFS4_BLOCK_EXTENT_SIZE 44
+
+/*
+ * Volume types of a block layout device address.  Only
+ * PNFS_BLOCK_VOLUME_SIMPLE is produced by the encoder in blocklayoutxdr.c;
+ * any other type makes it fail with -ENOTSUPP.
+ */
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
+};
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+/*
+ * One volume of a block layout device address.  The union currently only
+ * has a member for simple volumes: the signature bytes, the number of
+ * signature bytes in use and the offset that is encoded with them.
+ */
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } simple;
+ };
+};
+
+/*
+ * Block layout device address: a volume count followed by that many volume
+ * descriptions in a flexible array, so the structure has to be allocated
+ * with room for the volumes.
+ */
+struct pnfs_block_deviceaddr {
+ u32 nr_volumes;
+ struct pnfs_block_volume volumes[];
+};
+
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
+
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
#include "nfsd.h"
#include "nfsfh.h"
#include "netns.h"
+#include "pnfs.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
exp.cd = cd;
+ exp.ex_devid_map = NULL;
/* expiry */
err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (!gid_valid(exp.ex_anon_gid))
goto out4;
err = 0;
+
+ nfsd4_setup_layout_type(&exp);
}
expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
+ new->ex_layout_type = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_anon_uid = item->ex_anon_uid;
new->ex_anon_gid = item->ex_anon_gid;
new->ex_fsid = item->ex_fsid;
+ new->ex_devid_map = item->ex_devid_map;
+ item->ex_devid_map = NULL;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
item->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = item->ex_fslocs.migrated;
item->ex_fslocs.migrated = 0;
+ new->ex_layout_type = item->ex_layout_type;
new->ex_nflavors = item->ex_nflavors;
for (i = 0; i < MAX_SECINFO_LIST; i++) {
new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
+ enum pnfs_layouttype ex_layout_type;
+ struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
};
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
return status;
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ * struct layoutrecall_file4 {
+ * nfs_fh4 lor_fh;
+ * offset4 lor_offset;
+ * length4 lor_length;
+ * stateid4 lor_stateid;
+ * };
+ *
+ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ * case LAYOUTRECALL4_FILE:
+ * layoutrecall_file4 lor_layout;
+ * case LAYOUTRECALL4_FSID:
+ * fsid4 lor_fsid;
+ * case LAYOUTRECALL4_ALL:
+ * void;
+ * };
+ *
+ * struct CB_LAYOUTRECALL4args {
+ * layouttype4 clora_type;
+ * layoutiomode4 clora_iomode;
+ * bool clora_changed;
+ * layoutrecall4 clora_recall;
+ * };
+ */
+/*
+ * Encode a CB_LAYOUTRECALL operation that recalls the complete layout of
+ * the file behind @ls: iomode IOMODE_ANY, recall type RETURN_FILE, byte
+ * range 0..NFS4_MAX_UINT64 and the recall stateid of @ls.  Bumps hdr->nops
+ * for the operation.  Must not be used on a minorversion 0 callback.
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+ const struct nfs4_layout_stateid *ls,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ BUG_ON(hdr->minorversion == 0);
+
+ /* opcode, clora_type, clora_iomode, clora_changed, lor_recalltype */
+ p = xdr_reserve_space(xdr, 5 * 4);
+ *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+ *p++ = cpu_to_be32(ls->ls_layout_type);
+ *p++ = cpu_to_be32(IOMODE_ANY);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(RETURN_FILE);
+
+ encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+ /* lor_offset and lor_length */
+ p = xdr_reserve_space(xdr, 2 * 8);
+ p = xdr_encode_hyper(p, 0);
+ xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+ encode_stateid4(xdr, &ls->ls_recall_sid);
+
+ hdr->nops++;
+}
+
+/*
+ * Encode the callback compound for a layout recall: compound header,
+ * CB_SEQUENCE, CB_LAYOUTRECALL, then the final operation count.  @cb is
+ * expected to be the ls_recall member embedded in a nfs4_layout_stateid,
+ * which is recovered with container_of().
+ */
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfsd4_callback *cb)
+{
+ const struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_layout4args(xdr, ls, &hdr);
+ encode_cb_nops(&hdr);
+}
+
+/*
+ * Decode the reply to a layout recall callback compound.
+ *
+ * Decodes the compound header, the CB_SEQUENCE result (only when @cb is
+ * non-NULL) and the CB_LAYOUTRECALL status.  Returns 0 on NFS4_OK, the
+ * first decoding error otherwise, or the NFS status of the operation
+ * translated by nfs_cb_stat_to_errno().
+ */
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_compound_hdr hdr;
+ enum nfsstat4 nfserr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ goto out;
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status))
+ goto out;
+ }
+ status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+ if (unlikely(status))
+ goto out;
+ if (unlikely(nfserr != NFS4_OK))
+ status = nfs_cb_stat_to_errno(nfserr);
+out:
+ return status;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
/*
* RPC procedure tables
*/
@@ -563,6 +659,9 @@ out:
static struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NULL, NULL, cb_null, cb_null),
PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
+#endif
};
static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+/* One layout segment held under a layout stateid. */
+struct nfs4_layout {
+ /* link in the owning stateid's ls_layouts list, or in a reap list */
+ struct list_head lo_perstate;
+ /* layout stateid this segment belongs to */
+ struct nfs4_layout_stateid *lo_state;
+ /* the segment itself; iomode, offset and length are used in this file */
+ struct nfsd4_layout_seg lo_seg;
+};
+
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+
+/* Layout driver operations, indexed by layout type. */
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+ [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
+};
+
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS 8
+#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+/*
+ * Map a device ID index to a bucket of nfsd_devid_hash: idx and idx >> 32
+ * are fed to jhash_2words() and the result is masked to the table size.
+ */
+static inline u32 devid_hashfn(u64 idx)
+{
+	u32 hash = jhash_2words(idx, idx >> 32, 0);
+
+	return hash & DEVID_HASH_MASK;
+}
+
+/*
+ * Make sure the export behind @fhp has a device ID map attached.
+ *
+ * A candidate map is allocated up front, before nfsd_devid_lock is taken.
+ * Under the lock the export is rechecked, then the whole hash table is
+ * searched for an existing map with the same fsid; only if neither exists
+ * is the candidate given the next sequence number and published.  An unused
+ * candidate is freed on the way out.
+ *
+ * Returns nothing: callers detect failure by checking that
+ * fhp->fh_export->ex_devid_map is still NULL afterwards.
+ */
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+ const struct knfsd_fh *fh = &fhp->fh_handle;
+ size_t fsid_len = key_len(fh->fh_fsid_type);
+ struct nfsd4_deviceid_map *map, *old;
+ int i;
+
+ /* the fsid bytes are stored directly behind the map structure */
+ map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+ if (!map)
+ return;
+
+ map->fsid_type = fh->fh_fsid_type;
+ memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+ spin_lock(&nfsd_devid_lock);
+ /* somebody else attached a map while we were allocating */
+ if (fhp->fh_export->ex_devid_map)
+ goto out_unlock;
+
+ /*
+ * Entries are hashed by idx, not by fsid, so finding an existing map
+ * for this fsid requires walking every bucket.
+ */
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+ if (old->fsid_type != fh->fh_fsid_type)
+ continue;
+ if (memcmp(old->fsid, fh->fh_fsid,
+ key_len(old->fsid_type)))
+ continue;
+
+ fhp->fh_export->ex_devid_map = old;
+ goto out_unlock;
+ }
+ }
+
+ map->idx = nfsd_devid_seq++;
+ list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+ fhp->fh_export->ex_devid_map = map;
+ /* ownership moved to the hash table; skip the kfree() below */
+ map = NULL;
+
+out_unlock:
+ spin_unlock(&nfsd_devid_lock);
+ kfree(map);
+}
+
+/*
+ * Look up the device ID map whose idx equals @idx.
+ *
+ * Walks the matching hash bucket under rcu_read_lock() and returns the map,
+ * or NULL if none matches.  The pointer is returned after the RCU read
+ * section has ended; in this file maps are only freed by nfsd4_exit_pnfs().
+ *
+ * NOTE(review): @idx is an int while map indices are taken from the u64
+ * nfsd_devid_seq counter - confirm the narrowing is intended.
+ */
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+ struct nfsd4_deviceid_map *map, *ret = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+ if (map->idx == idx)
+ ret = map;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Fill in @id for the export behind @fhp.
+ *
+ * Attaches a device ID map to the export on first use.  Returns 0 on
+ * success, or -ENOMEM if the export still has no map after the attempt.
+ */
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+		u32 device_generation)
+{
+	struct svc_export *exp = fhp->fh_export;
+
+	if (!exp->ex_devid_map) {
+		nfsd4_alloc_devid_map(fhp);
+		if (!exp->ex_devid_map)
+			return -ENOMEM;
+	}
+
+	id->pad = 0;
+	id->generation = device_generation;
+	id->fsid_idx = exp->ex_devid_map->idx;
+	return 0;
+}
+
+/*
+ * Select the layout type offered for @exp.
+ *
+ * Nothing is set when the export carries NFSEXP_NOPNFS.  Otherwise the
+ * export is marked LAYOUT_BLOCK_VOLUME when the superblock's export
+ * operations provide get_uuid, map_blocks and commit_blocks.
+ */
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+	struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+
+	if (exp->ex_flags & NFSEXP_NOPNFS)
+		return;
+
+	if (!sb->s_export_op->get_uuid)
+		return;
+	if (!sb->s_export_op->map_blocks)
+		return;
+	if (!sb->s_export_op->commit_blocks)
+		return;
+
+	exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+}
+
+/*
+ * sc_free callback for layout stateids (installed by
+ * nfsd4_alloc_layout_stateid()).
+ *
+ * Unlinks the stateid from the per-client and per-file lists, removes the
+ * lease taken in nfsd4_layout_setlease(), drops the file reference, undoes
+ * the recall accounting if a recall was started, and frees the object.
+ */
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+ struct nfs4_layout_stateid *ls = layoutstateid(stid);
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+ trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+
+ spin_lock(&clp->cl_lock);
+ list_del_init(&ls->ls_perclnt);
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&fp->fi_lock);
+ list_del_init(&ls->ls_perfile);
+ spin_unlock(&fp->fi_lock);
+
+ /*
+ * The lease was set up with fl_owner = ls; presumably the last
+ * argument identifies which lease to drop - TODO confirm.
+ */
+ vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+ fput(ls->ls_file);
+
+ /* balances the atomic_inc() in nfsd4_recall_file_layout() */
+ if (ls->ls_recalled)
+ atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+
+ kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+
+/*
+ * Take an FL_LAYOUT read lease on ls->ls_file, owned by @ls.
+ *
+ * Returns 0 on success, -ENOMEM if the lock cannot be allocated, or the
+ * error returned by vfs_setlease().  On failure the lock is freed here.
+ */
+static int
+nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+{
+ struct file_lock *fl;
+ int status;
+
+ fl = locks_alloc_lock();
+ if (!fl)
+ return -ENOMEM;
+ locks_init_lock(fl);
+ fl->fl_lmops = &nfsd4_layouts_lm_ops;
+ fl->fl_flags = FL_LAYOUT;
+ fl->fl_type = F_RDLCK;
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_owner = ls;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = ls->ls_file;
+
+ status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+ if (status) {
+ locks_free_lock(fl);
+ return status;
+ }
+ /* on success vfs_setlease() is expected to have cleared fl */
+ BUG_ON(fl != NULL);
+ return 0;
+}
+
+/*
+ * Allocate and publish a new layout stateid for the file behind @parent.
+ *
+ * @cstate:      compound state; supplies the owning client
+ * @parent:      open, lock or delegation stateid the layout is derived from
+ * @layout_type: layout type recorded in the new stateid
+ *
+ * The new stateid takes a reference on the nfs4_file and on a struct file
+ * (the delegation file for a delegation parent, any open file otherwise),
+ * takes a layout lease on that file, and is then linked into the per-client
+ * and per-file lists.
+ *
+ * Returns the new layout stateid, or NULL if allocation or lease setup
+ * failed.
+ */
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+		struct nfs4_stid *parent, u32 layout_type)
+{
+	struct nfs4_client *clp = cstate->clp;
+	struct nfs4_file *fp = parent->sc_file;
+	struct nfs4_layout_stateid *ls;
+	struct nfs4_stid *stp;
+
+	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+	if (!stp)
+		return NULL;
+	stp->sc_free = nfsd4_free_layout_stateid;
+	get_nfs4_file(fp);
+	stp->sc_file = fp;
+
+	ls = layoutstateid(stp);
+	INIT_LIST_HEAD(&ls->ls_perclnt);
+	INIT_LIST_HEAD(&ls->ls_perfile);
+	spin_lock_init(&ls->ls_lock);
+	INIT_LIST_HEAD(&ls->ls_layouts);
+	ls->ls_layout_type = layout_type;
+	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+			NFSPROC4_CLNT_CB_LAYOUT);
+
+	if (parent->sc_type == NFS4_DELEG_STID)
+		ls->ls_file = get_file(fp->fi_deleg_file);
+	else
+		ls->ls_file = find_any_file(fp);
+	BUG_ON(!ls->ls_file);
+
+	if (nfsd4_layout_setlease(ls)) {
+		/*
+		 * Drop the file reference taken just above; the normal free
+		 * path (nfsd4_free_layout_stateid) is bypassed here, so
+		 * nothing else would release it.
+		 *
+		 * NOTE(review): if nfs4_alloc_stid() already published the
+		 * stid in a per-client lookup structure, that entry must be
+		 * removed here too - confirm against nfs4_alloc_stid().
+		 */
+		fput(ls->ls_file);
+		put_nfs4_file(fp);
+		kmem_cache_free(nfs4_layout_stateid_cache, ls);
+		return NULL;
+	}
+
+	spin_lock(&clp->cl_lock);
+	stp->sc_type = NFS4_LAYOUT_STID;
+	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+	spin_unlock(&clp->cl_lock);
+
+	spin_lock(&fp->fi_lock);
+	list_add(&ls->ls_perfile, &fp->fi_lo_states);
+	spin_unlock(&fp->fi_lock);
+
+	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+	return ls;
+}
+
+/*
+ * Resolve @stateid to a layout stateid for the current filehandle.
+ *
+ * @create:      when true, an open, lock or delegation stateid is also
+ *               accepted and a new layout stateid is allocated from it
+ * @layout_type: layout type an existing layout stateid must match, or the
+ *               type recorded in a newly created one
+ * @lsp:         set to the layout stateid on success
+ *
+ * Returns 0 on success; the caller then owns a reference that it drops with
+ * nfs4_put_stid().  Returns the lookup error, nfserr_bad_stateid (wrong
+ * file, newer generation than the server's, or layout type mismatch), or
+ * nfserr_jukebox when a new layout stateid could not be allocated.
+ */
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_stid *stid;
+ unsigned char typemask = NFS4_LAYOUT_STID;
+ __be32 status;
+
+ if (create)
+ typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+ net_generic(SVC_NET(rqstp), nfsd_net_id));
+ if (status)
+ goto out;
+
+ /* the stateid must belong to the file named by the current fh */
+ if (!fh_match(&cstate->current_fh.fh_handle,
+ &stid->sc_file->fi_fhandle)) {
+ status = nfserr_bad_stateid;
+ goto out_put_stid;
+ }
+
+ if (stid->sc_type != NFS4_LAYOUT_STID) {
+ /* parent stateid: derive a new layout stateid, drop the parent */
+ ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+ nfs4_put_stid(stid);
+
+ status = nfserr_jukebox;
+ if (!ls)
+ goto out;
+ } else {
+ /* existing layout stateid: keep the reference from the lookup */
+ ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+ status = nfserr_bad_stateid;
+ if (stateid->si_generation > stid->sc_stateid.si_generation)
+ goto out_put_stid;
+ if (layout_type != ls->ls_layout_type)
+ goto out_put_stid;
+ }
+
+ *lsp = ls;
+ return 0;
+
+out_put_stid:
+ nfs4_put_stid(stid);
+out:
+ return status;
+}
+
+/*
+ * Start recalling all layouts held under @ls, at most once per stateid.
+ *
+ * Marks the stateid as recalled and bumps the file's fi_lo_recalls counter
+ * even if no layouts are outstanding; the callback is only queued when
+ * ls_layouts is non-empty.
+ */
+static void
+nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+{
+ spin_lock(&ls->ls_lock);
+ if (ls->ls_recalled)
+ goto out_unlock;
+
+ ls->ls_recalled = true;
+ atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+ if (list_empty(&ls->ls_layouts))
+ goto out_unlock;
+
+ trace_layout_recall(&ls->ls_stid.sc_stateid);
+
+ /* reference for the callback; dropped in nfsd4_cb_layout_release() */
+ atomic_inc(&ls->ls_stid.sc_count);
+ update_stateid(&ls->ls_stid.sc_stateid);
+ /* snapshot of the updated stateid that the recall will carry */
+ memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfsd4_run_cb(&ls->ls_recall);
+
+out_unlock:
+ spin_unlock(&ls->ls_lock);
+}
+
+/*
+ * Return the end offset (offset + length) of @seg, saturating to
+ * NFS4_MAX_UINT64 when the sum wraps around.
+ */
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+	u64 sum = seg->offset + seg->length;
+
+	if (sum < seg->offset)
+		return NFS4_MAX_UINT64;
+	return sum;
+}
+
+/*
+ * Set the length of @lo so that it ends at @end.  An @end of
+ * NFS4_MAX_UINT64 is stored as a length of NFS4_MAX_UINT64 rather than as a
+ * difference.
+ */
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+	if (end != NFS4_MAX_UINT64) {
+		lo->length = end - lo->offset;
+		return;
+	}
+	lo->length = NFS4_MAX_UINT64;
+}
+
+/*
+ * Return true if layout @lo is matched by segment @s: the iomode of @s is
+ * IOMODE_ANY or equal to the layout's, and the two byte ranges intersect.
+ */
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+	struct nfsd4_layout_seg *held = &lo->lo_seg;
+
+	if (s->iomode != IOMODE_ANY && s->iomode != held->iomode)
+		return false;
+
+	return layout_end(held) > s->offset &&
+	       layout_end(s) > held->offset;
+}
+
+/*
+ * Try to merge segment @new into the existing segment @lo.
+ *
+ * The segments are merged only if they share an iomode and their ranges
+ * overlap or touch.  On success @lo is grown to cover the union of both
+ * ranges and true is returned; otherwise @lo is left untouched and false is
+ * returned.
+ */
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+	u64 end;
+
+	if (lo->iomode != new->iomode)
+		return false;
+	if (layout_end(new) < lo->offset)
+		return false;
+	if (layout_end(lo) < new->offset)
+		return false;
+
+	/*
+	 * The end of the union has to be computed before lo->offset is
+	 * lowered: layout_end(lo) is offset + length, so evaluating it after
+	 * the offset update would understate the end whenever @new starts
+	 * before @lo.
+	 */
+	end = max(layout_end(lo), layout_end(new));
+	lo->offset = min(lo->offset, new->offset);
+	layout_update_len(lo, end);
+	return true;
+}
+
+/*
+ * Recall the layouts of every other layout stateid on the same file as @ls.
+ *
+ * Must be called with the file's fi_lock held.  Returns nfs_ok if @ls is
+ * the only layout stateid on the file, nfserr_recallconflict if at least
+ * one other stateid was found (and asked to recall).
+ */
+static __be32
+nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_file *fp = ls->ls_stid.sc_file;
+	struct nfs4_layout_stateid *other, *next;
+	__be32 nfserr = nfs_ok;
+
+	assert_spin_locked(&fp->fi_lock);
+
+	list_for_each_entry_safe(other, next, &fp->fi_lo_states, ls_perfile) {
+		if (other == ls)
+			continue;
+
+		nfsd4_recall_file_layout(other);
+		nfserr = nfserr_recallconflict;
+	}
+
+	return nfserr;
+}
+
+/*
+ * Record the segment granted by a LAYOUTGET under layout stateid @ls.
+ *
+ * The segment is merged into an existing one where possible; otherwise a
+ * new nfs4_layout is allocated and appended, taking an extra reference on
+ * the stateid.  The allocation may sleep, so it is done with both locks
+ * dropped and the conflict check and merge attempt are repeated afterwards.
+ * On success the stateid is updated and copied into lgp->lg_sid.
+ *
+ * Returns nfs_ok, nfserr_recallconflict if another layout stateid exists on
+ * the file, or nfserr_jukebox on allocation failure.
+ */
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+ struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+ struct nfs4_layout *lp, *new = NULL;
+ __be32 nfserr;
+
+ /* first pass: try to merge without allocating anything */
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+ spin_unlock(&ls->ls_lock);
+ spin_unlock(&fp->fi_lock);
+
+ new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+ if (!new)
+ return nfserr_jukebox;
+ memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ new->lo_state = ls;
+
+ /* second pass: the lists may have changed while the locks were dropped */
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+
+ /* each listed layout holds a stateid reference, see nfsd4_free_layouts() */
+ atomic_inc(&ls->ls_stid.sc_count);
+ list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+ new = NULL;
+done:
+ update_stateid(&ls->ls_stid.sc_stateid);
+ memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ spin_unlock(&ls->ls_lock);
+out:
+ spin_unlock(&fp->fi_lock);
+ /* non-NULL only if the allocation ended up unused */
+ if (new)
+ kmem_cache_free(nfs4_layout_cache, new);
+ return nfserr;
+}
+
+/*
+ * Free every layout on @reaplist, dropping the stateid reference each one
+ * holds.  The list is empty on return.
+ */
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+ while (!list_empty(reaplist)) {
+ struct nfs4_layout *lp = list_first_entry(reaplist,
+ struct nfs4_layout, lo_perstate);
+
+ list_del(&lp->lo_perstate);
+ nfs4_put_stid(&lp->lo_state->ls_stid);
+ kmem_cache_free(nfs4_layout_cache, lp);
+ }
+}
+
+/*
+ * Apply the returned range @seg to the held layout @lp.
+ *
+ *  - @seg covers the whole layout: move it to @reaplist for freeing.
+ *  - @seg covers the front of the layout: the layout now starts where
+ *    @seg ends and keeps its old end.
+ *  - @seg covers the tail of the layout: the layout keeps its start and
+ *    now ends where @seg starts.
+ *  - @seg lies strictly inside the layout: splitting is not supported, so
+ *    the layout is retained unchanged.
+ */
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+		struct list_head *reaplist)
+{
+	struct nfsd4_layout_seg *lo = &lp->lo_seg;
+	u64 end = layout_end(lo);
+
+	if (seg->offset <= lo->offset) {
+		if (layout_end(seg) >= end) {
+			list_move_tail(&lp->lo_perstate, reaplist);
+			return;
+		}
+		/* front returned: advance the start, the end is unchanged */
+		lo->offset = layout_end(seg);
+	} else {
+		/* retain the whole layout segment on a split. */
+		if (layout_end(seg) < end) {
+			dprintk("%s: split not supported\n", __func__);
+			return;
+		}
+
+		/* tail returned: the layout now ends where @seg begins */
+		end = seg->offset;
+	}
+
+	layout_update_len(lo, end);
+}
+
+/*
+ * Handle a LAYOUTRETURN of type RETURN_FILE.
+ *
+ * Looks up the layout stateid named by lrp->lr_sid (no creation) and
+ * applies lrp->lr_seg to every overlapping layout held under it.  If
+ * layouts remain, lrs_present is set and, when anything matched, the
+ * updated stateid is copied back into lrp->lr_sid.  If none remain the
+ * stateid is unhashed and lrs_present is cleared.
+ *
+ * Returns nfs_ok, or the error from the stateid lookup.
+ */
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_layout *lp, *n;
+ LIST_HEAD(reaplist);
+ __be32 nfserr;
+ int found = 0;
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+ false, lrp->lr_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_layout_return_lookup_fail(&lrp->lr_sid);
+ return nfserr;
+ }
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+ if (layouts_overlapping(lp, &lrp->lr_seg)) {
+ nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+ found++;
+ }
+ }
+ if (!list_empty(&ls->ls_layouts)) {
+ if (found) {
+ update_stateid(&ls->ls_stid.sc_stateid);
+ memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+ sizeof(stateid_t));
+ }
+ lrp->lrs_present = 1;
+ } else {
+ trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+ nfs4_unhash_stid(&ls->ls_stid);
+ lrp->lrs_present = 0;
+ }
+ spin_unlock(&ls->ls_lock);
+
+ /* drop the lookup reference, then free outside the spinlock */
+ nfs4_put_stid(&ls->ls_stid);
+ nfsd4_free_layouts(&reaplist);
+ return nfs_ok;
+}
+
+/*
+ * Handle a LAYOUTRETURN of type RETURN_FSID or RETURN_ALL.
+ *
+ * Walks all layout stateids of the client.  For RETURN_FSID, stateids
+ * whose file is on a different fsid than the current filehandle are
+ * skipped.  Every layout whose iomode matches lrp->lr_seg.iomode (or all of
+ * them for IOMODE_ANY) is collected and freed once the locks are dropped.
+ *
+ * Always returns 0 and reports lrs_present = 0.
+ */
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_layout *lp, *t;
+ LIST_HEAD(reaplist);
+
+ lrp->lrs_present = 0;
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+ if (lrp->lr_return_type == RETURN_FSID &&
+ !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+ &cstate->current_fh.fh_handle))
+ continue;
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+ if (lrp->lr_seg.iomode == IOMODE_ANY ||
+ lrp->lr_seg.iomode == lp->lo_seg.iomode)
+ list_move_tail(&lp->lo_perstate, &reaplist);
+ }
+ spin_unlock(&ls->ls_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ nfsd4_free_layouts(&reaplist);
+ return 0;
+}
+
+/*
+ * Move every layout held under @ls onto @reaplist, leaving ls_layouts
+ * empty.  The caller frees the collected layouts with nfsd4_free_layouts().
+ */
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+ struct list_head *reaplist)
+{
+ spin_lock(&ls->ls_lock);
+ list_splice_init(&ls->ls_layouts, reaplist);
+ spin_unlock(&ls->ls_lock);
+}
+
+/*
+ * Drop all layouts held by @clp, across all of its layout stateids.  The
+ * layouts are collected under cl_lock and freed after it is released.
+ */
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ spin_unlock(&clp->cl_lock);
+
+ nfsd4_free_layouts(&reaplist);
+}
+
+/*
+ * Drop all layouts that @clp holds on file @fp.  The layouts are collected
+ * under fi_lock and freed after it is released.
+ */
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&fp->fi_lock);
+ list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+ if (ls->ls_stid.sc_client == clp)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ }
+ spin_unlock(&fp->fi_lock);
+
+ nfsd4_free_layouts(&reaplist);
+}
+
+/*
+ * Fence a client that failed to respond to a layout recall.
+ *
+ * Logs a warning and runs the /sbin/nfsd-recall-failed helper with the
+ * client address and the s_id of the superblock holding the layout's file,
+ * waiting for the helper to finish.  A helper failure is logged but not
+ * propagated.
+ *
+ * This function must not call itself: an unconditional self-call here
+ * recurses until the kernel stack overflows.
+ */
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+	struct nfs4_client *clp = ls->ls_stid.sc_client;
+	char addr_str[INET6_ADDRSTRLEN];
+	static char *envp[] = {
+		"HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[8];
+	int error;
+
+	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+
+	printk(KERN_WARNING
+		"nfsd: client %s failed to respond to layout recall. "
+		" Fencing..\n", addr_str);
+
+	argv[0] = "/sbin/nfsd-recall-failed";
+	argv[1] = addr_str;
+	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+	argv[3] = NULL;
+
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error) {
+		printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+			addr_str, error);
+	}
+}
+
+/*
+ * Completion handler for a layout recall callback.
+ *
+ * Returns 1 when the recall is finished (success, or the client reports
+ * NFS4ERR_NOMATCHING_LAYOUT, which is turned into success), 0 when the RPC
+ * was delayed for a retry because the client answered NFS4ERR_DELAY, and
+ * -1 after fencing the client on any other status.
+ */
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+	switch (task->tk_status) {
+	case 0:
+		return 1;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+		task->tk_status = 0;
+		return 1;
+	case -NFS4ERR_DELAY:
+		/* Poll the client until it's done with the layout */
+		/* FIXME: cap number of retries.
+		 * The pNFS standard states that we need to only expire
+		 * the client after at least "lease time", e.g. lease-time * 2,
+		 * when failing to communicate a recall
+		 */
+		rpc_delay(task, HZ/100); /* 10 milliseconds */
+		return 0;
+	default:
+		/*
+		 * Unknown error or non-responding client, we'll need to fence.
+		 */
+		nfsd4_cb_layout_fail(ls);
+		return -1;
+	}
+}
+
+/*
+ * Release handler for a layout recall callback: drops every layout still
+ * held under the stateid and puts the stateid reference taken in
+ * nfsd4_recall_file_layout() when the callback was queued.
+ */
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ LIST_HEAD(reaplist);
+
+ trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+
+ nfsd4_return_all_layouts(ls, &reaplist);
+ nfsd4_free_layouts(&reaplist);
+ nfs4_put_stid(&ls->ls_stid);
+}
+
+/* Callback operations installed on ls_recall by nfsd4_alloc_layout_stateid(). */
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .done = nfsd4_cb_layout_done,
+ .release = nfsd4_cb_layout_release,
+};
+
+/*
+ * lm_break hook for layout leases: starts a recall of the layouts owned by
+ * the lease's owner (the layout stateid stored in fl_owner) and returns
+ * false.
+ */
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+ /*
+ * We don't want the locks code to timeout the lease for us;
+ * we'll remove it ourself if a layout isn't returned
+ * in time:
+ */
+ fl->fl_break_time = 0;
+ nfsd4_recall_file_layout(fl->fl_owner);
+ return false;
+}
+
+/*
+ * lm_change hook for layout leases.  Only unlocking is expected: any @arg
+ * without F_UNLCK set triggers BUG_ON().  The change itself is delegated to
+ * lease_modify().
+ */
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+ struct list_head *dispose)
+{
+ BUG_ON(!(arg & F_UNLCK));
+ return lease_modify(onlist, arg, dispose);
+}
+
+/* Lock manager operations set on layout leases in nfsd4_layout_setlease(). */
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+ .lm_break = nfsd4_layout_lm_break,
+ .lm_change = nfsd4_layout_lm_change,
+};
+
+/*
+ * Set up the pNFS layout infrastructure: initialize the device ID hash
+ * buckets and create the slab caches for layouts and layout stateids.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created; a
+ * layout cache that was already created is destroyed again on failure.
+ */
+int
+nfsd4_init_pnfs(void)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < DEVID_HASH_SIZE; bucket++)
+		INIT_LIST_HEAD(&nfsd_devid_hash[bucket]);
+
+	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+			sizeof(struct nfs4_layout), 0, 0, NULL);
+	if (!nfs4_layout_cache)
+		goto out;
+
+	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	if (!nfs4_layout_stateid_cache)
+		goto out_destroy_layout_cache;
+
+	return 0;
+
+out_destroy_layout_cache:
+	kmem_cache_destroy(nfs4_layout_cache);
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Tear down what nfsd4_init_pnfs() set up: destroy both slab caches and
+ * free every device ID map still linked in the hash table.
+ */
+void
+nfsd4_exit_pnfs(void)
+{
+ int i;
+
+ kmem_cache_destroy(nfs4_layout_cache);
+ kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ struct nfsd4_deviceid_map *map, *n;
+
+ /* entries are freed without being unlinked first */
+ list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+ kfree(map);
+ }
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
#include "current_stateid.h"
#include "netns.h"
#include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status == nfserr_same ? nfs_ok : status;
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Check that @exp offers pNFS with exactly the requested @layout_type.
+ *
+ * Returns the layout driver operations for that type, or NULL if the
+ * export has no layout type set or it differs from @layout_type.
+ */
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+ if (!exp->ex_layout_type) {
+ dprintk("%s: export does not support pNFS\n", __func__);
+ return NULL;
+ }
+
+ if (exp->ex_layout_type != layout_type) {
+ dprintk("%s: layout type %d not supported\n",
+ __func__, layout_type);
+ return NULL;
+ }
+
+ /* layout_type equals the export's type here, so it indexes the table */
+ return nfsd4_layout_ops[layout_type];
+}
+
+/*
+ * GETDEVICEINFO: describe the device identified by gdp->gd_devid.
+ *
+ * The device ID is mapped back to an export through the device ID map.
+ * If the export supports the requested layout type, the layout driver
+ * fills in the device info (skipped when gd_maxcount is 0) and the
+ * requested notification types are masked to those the driver supports.
+ *
+ * Returns nfs_ok, nfserr_noent if the device ID or its export cannot be
+ * found, nfserr_layoutunavailable if the export does not offer this layout
+ * type, or the error from the driver's proc_getdeviceinfo.
+ */
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	const struct nfsd4_layout_ops *ops;
+	struct nfsd4_deviceid_map *map;
+	struct svc_export *exp;
+	__be32 nfserr;
+
+	dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+	       __func__,
+	       gdp->gd_layout_type,
+	       gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+	       gdp->gd_maxcount);
+
+	map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+	if (!map) {
+		dprintk("%s: couldn't find device ID to export mapping!\n",
+			__func__);
+		return nfserr_noent;
+	}
+
+	exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+	if (IS_ERR(exp)) {
+		dprintk("%s: could not find device id\n", __func__);
+		return nfserr_noent;
+	}
+
+	/*
+	 * From here on we hold a reference on exp, so every exit path,
+	 * including the layout type mismatch below, must go through
+	 * exp_put().
+	 */
+	nfserr = nfserr_layoutunavailable;
+	ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+	if (!ops)
+		goto out;
+
+	nfserr = nfs_ok;
+	if (gdp->gd_maxcount != 0)
+		nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+
+	gdp->gd_notify_types &= ops->notify_types;
+out:
+	exp_put(exp);
+	return nfserr;
+}
+
+/*
+ * LAYOUTGET: grant a layout on the current filehandle.
+ *
+ * Checks, in order: the iomode (READ or RW only), access to the file for
+ * that iomode, that the export offers the requested layout type, the
+ * length/offset arguments, and the stateid.  The request is refused with
+ * nfserr_recallconflict while a layout recall is outstanding on the file.
+ * The layout driver then produces the layout and the granted segment is
+ * recorded under the layout stateid.
+ */
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutget *lgp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+ int accmode;
+
+ switch (lgp->lg_seg.iomode) {
+ case IOMODE_READ:
+ accmode = NFSD_MAY_READ;
+ break;
+ case IOMODE_RW:
+ accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n",
+ __func__, lgp->lg_seg.iomode);
+ nfserr = nfserr_badiomode;
+ goto out;
+ }
+
+ nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+ if (!ops)
+ goto out;
+
+ /*
+ * Verify minlength and range as per RFC5661:
+ * o If loga_length is less than loga_minlength,
+ * the metadata server MUST return NFS4ERR_INVAL.
+ * o If the sum of loga_offset and loga_minlength exceeds
+ * NFS4_UINT64_MAX, and loga_minlength is not
+ * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+ * o If the sum of loga_offset and loga_length exceeds
+ * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+ * the error NFS4ERR_INVAL MUST result.
+ */
+ nfserr = nfserr_inval;
+ if (lgp->lg_seg.length < lgp->lg_minlength ||
+ (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+ lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+ (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+ lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+ goto out;
+ /* a zero-length request is rejected as well */
+ if (lgp->lg_seg.length == 0)
+ goto out;
+
+ /* create == true: open, lock and delegation stateids are accepted too */
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+ true, lgp->lg_layout_type, &ls);
+ if (nfserr) {
+ trace_layout_get_lookup_fail(&lgp->lg_sid);
+ goto out;
+ }
+
+ /* no new layouts while a recall is pending on this file */
+ nfserr = nfserr_recallconflict;
+ if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+ goto out_put_stid;
+
+ nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+ current_fh, lgp);
+ if (nfserr)
+ goto out_put_stid;
+
+ nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+/*
+ * LAYOUTCOMMIT: commit data written through a layout.
+ *
+ * new_size is the last written offset plus one.  It must lie inside the
+ * committed segment and, unless lc_newoffset is set, must not exceed the
+ * current file size.  A stateid lookup failure of nfserr_bad_stateid is
+ * reported as nfserr_badlayout.  After the layout driver has committed,
+ * lc_size_chg and lc_newsize report whether new_size exceeds the file size.
+ *
+ * NOTE(review): lc_size_chg and lc_newsize are only filled in after
+ * ops->proc_layoutcommit() has run; if that hook reads them they need to
+ * be set earlier - confirm against the layout driver.
+ */
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutcommit *lcp)
+{
+ const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ loff_t new_size = lcp->lc_last_wr + 1;
+ struct inode *inode;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+ if (!ops)
+ goto out;
+ inode = current_fh->fh_dentry->d_inode;
+
+ nfserr = nfserr_inval;
+ if (new_size <= seg->offset) {
+ dprintk("pnfsd: last write before layout segment\n");
+ goto out;
+ }
+ if (new_size > seg->offset + seg->length) {
+ dprintk("pnfsd: last write beyond layout segment\n");
+ goto out;
+ }
+ if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+ dprintk("pnfsd: layoutcommit beyond EOF\n");
+ goto out;
+ }
+
+ /* create == false: only an existing layout stateid is accepted */
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+ false, lcp->lc_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_layout_commit_lookup_fail(&lcp->lc_sid);
+ /* fixup error code as per RFC5661 */
+ if (nfserr == nfserr_bad_stateid)
+ nfserr = nfserr_badlayout;
+ goto out;
+ }
+
+ nfserr = ops->proc_layoutcommit(inode, lcp);
+ if (nfserr)
+ goto out_put_stid;
+
+ if (new_size > i_size_read(inode)) {
+ lcp->lc_size_chg = 1;
+ lcp->lc_newsize = new_size;
+ } else {
+ lcp->lc_size_chg = 0;
+ }
+
+out_put_stid:
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+/*
+ * LAYOUTRETURN: return layouts for a file, an fsid or the whole client.
+ *
+ * Verifies the current filehandle, that the export offers the given layout
+ * type, and that the iomode is READ, RW or ANY, then dispatches on the
+ * return type.  Unknown iomodes and return types yield nfserr_inval.
+ */
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+ goto out;
+
+ switch (lrp->lr_seg.iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ case IOMODE_ANY:
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n", __func__,
+ lrp->lr_seg.iomode);
+ nfserr = nfserr_inval;
+ goto out;
+ }
+
+ switch (lrp->lr_return_type) {
+ case RETURN_FILE:
+ nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+ break;
+ case RETURN_FSID:
+ case RETURN_ALL:
+ /* both handled by one helper, which filters by fsid for RETURN_FSID */
+ nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+ break;
+ default:
+ dprintk("%s: invalid return_type %d\n", __func__,
+ lrp->lr_return_type);
+ nfserr = nfserr_inval;
+ break;
+ }
+out:
+ return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
/*
* NULL call.
*/
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE 128
+/*
+ * Upper bound, in bytes, on the size of an encoded LAYOUTGET reply.  The
+ * layout body itself is bounded by the arbitrary MAX_LAYOUT_SIZE words.
+ */
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* logr_return_on_close */ +
+ op_encode_stateid_maxsz +
+ 1 /* nr of layouts */ +
+ MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+
+/* Upper bound, in bytes, on the size of an encoded LAYOUTCOMMIT reply. */
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* locr_newsize */ +
+ 2 /* ns_size */) * sizeof(__be32);
+}
+
+/* Upper bound, in bytes, on the size of an encoded LAYOUTRETURN reply. */
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* lrs_stateid */ +
+ op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = {
+ .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_GETDEVICEINFO",
+ },
+ [OP_LAYOUTGET] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutget,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTGET",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+ },
+ [OP_LAYOUTCOMMIT] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutcommit,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTCOMMIT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+ },
+ [OP_LAYOUTRETURN] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTRETURN",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+ },
+#endif /* CONFIG_NFSD_PNFS */
/* NFSv4.2 operations */
[OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 532a60cca2fb..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
#include "current_stateid.h"
#include "netns.h"
+#include "pnfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
clp->cl_time = get_seconds();
}
-static inline void
-renew_client(struct nfs4_client *clp)
-{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
- spin_lock(&nn->client_lock);
- renew_client_locked(clp);
- spin_unlock(&nn->client_lock);
-}
-
static void put_client_renew_locked(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
kmem_cache_free(file_slab, fp);
}
-static inline void
+void
put_nfs4_file(struct nfs4_file *fi)
{
might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
}
}
-static inline void
-get_nfs4_file(struct nfs4_file *fi)
-{
- atomic_inc(&fi->fi_ref);
-}
-
static struct file *
__nfs4_get_fd(struct nfs4_file *f, int oflag)
{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
-static struct file *
+struct file *
find_any_file(struct nfs4_file *f)
{
struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
}
-static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
-{
- return fh1->fh_size == fh2->fh_size &&
- !memcmp(fh1->fh_base.fh_pad,
- fh2->fh_base.fh_pad,
- fh1->fh_size);
-}
-
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
__nfs4_file_put_access(fp, O_RDONLY);
}
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab)
{
struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
struct file *filp = NULL;
spin_lock(&fp->fi_lock);
- if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+ if (fp->fi_deleg_file && --fp->fi_delegees == 0)
swap(filp, fp->fi_deleg_file);
spin_unlock(&fp->fi_lock);
if (filp) {
- vfs_setlease(filp, F_UNLCK, NULL, NULL);
+ vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
fput(filp);
}
}
-static void unhash_stid(struct nfs4_stid *s)
+void nfs4_unhash_stid(struct nfs4_stid *s)
{
s->sc_type = 0;
}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
list_del_init(&stp->st_locks);
unhash_ol_stateid(stp);
- unhash_stid(&stp->st_stid);
+ nfs4_unhash_stid(&stp->st_stid);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
static int
STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
{
- if (clid->cl_boot == nn->boot_time)
+ /*
+ * We're assuming the clid was not given out from a boot
+ * precisely 2^32 (about 136 years) before this one. That seems
+ * a safe assumption:
+ */
+ if (clid->cl_boot == (u32)nn->boot_time)
return 0;
dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
nfs4_get_stateowner(&oo->oo_owner);
release_openowner(oo);
}
+ nfsd4_return_all_client_layouts(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
static void
nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
{
- /* pNFS is not supported */
+#ifdef CONFIG_NFSD_PNFS
+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
/* Referrals are supported, Migration is not. */
new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&fp->fi_lo_states);
+ atomic_set(&fp->fi_lo_recalls, 0);
+#endif
hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
struct nfs4_file *fp;
hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
- if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
+ if (fh_match(&fp->fi_fhandle, fh)) {
if (atomic_inc_not_zero(&fp->fi_ref))
return fp;
}
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
return NULL;
}
-static struct nfs4_file *
+struct nfs4_file *
find_file(struct knfsd_fh *fh)
{
struct nfs4_file *fp;
@@ -3856,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
/* Race breaker */
if (fp->fi_deleg_file) {
status = 0;
- atomic_inc(&fp->fi_delegees);
+ ++fp->fi_delegees;
hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- atomic_set(&fp->fi_delegees, 1);
+ fp->fi_delegees = 1;
hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -3902,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = -EAGAIN;
goto out_unlock;
}
- atomic_inc(&fp->fi_delegees);
+ ++fp->fi_delegees;
hash_delegation_locked(dp, fp);
status = 0;
out_unlock:
@@ -4295,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
{
- if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+ if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
return nfserr_bad_stateid;
return nfs_ok;
}
@@ -4446,7 +4439,7 @@ out_unlock:
return status;
}
-static __be32
+__be32
nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4860,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
+ stp->st_stid.sc_file);
+
nfsd4_close_open_stateid(stp);
/* put reference from nfs4_preprocess_seqid_op */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
#include "state.h"
#include "cache.h"
#include "netns.h"
+#include "pnfs.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
return ret;
}
+/*
+ * Decode an NFSv4 time value: a 64-bit 'seconds' count followed by a
+ * 32-bit 'nseconds' count (12 bytes of XDR in total).
+ *
+ * 'seconds' is stored into tv->tv_sec without any range check.
+ * 'nseconds' is stored into tv->tv_nsec and must be below 1000000000,
+ * otherwise nfserr_inval is returned.
+ *
+ * NOTE(review): an older comment here claimed that the high 32 bits of
+ * 'seconds' are required to be 0 and that 'nseconds' is ignored; the code
+ * below does neither - confirm which behaviour is intended.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+{
+ DECODE_HEAD;
+ u64 sec;
+
+ READ_BUF(12);
+ p = xdr_decode_hyper(p, &sec);
+ tv->tv_sec = sec;
+ tv->tv_nsec = be32_to_cpup(p++);
+ if (tv->tv_nsec >= (u32)1000000000)
+ return nfserr_inval;
+
+ DECODE_TAIL;
+}
+
static __be32
nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
{
int expected_len, len = 0;
u32 dummy32;
- u64 sec;
char *buf;
DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
- /* We require the high 32 bits of 'seconds' to be 0, and we ignore
- all 32 bits of 'nseconds'. */
- READ_BUF(12);
len += 12;
- p = xdr_decode_hyper(p, &sec);
- iattr->ia_atime.tv_sec = (time_t)sec;
- iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
- if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
- return nfserr_inval;
+ status = nfsd4_decode_time(argp, &iattr->ia_atime);
+ if (status)
+ return status;
iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
break;
case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
- /* We require the high 32 bits of 'seconds' to be 0, and we ignore
- all 32 bits of 'nseconds'. */
- READ_BUF(12);
len += 12;
- p = xdr_decode_hyper(p, &sec);
- iattr->ia_mtime.tv_sec = sec;
- iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
- if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
- return nfserr_inval;
+ status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+ if (status)
+ return status;
iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
break;
case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
DECODE_TAIL;
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Decode GETDEVICEINFO arguments: the device id, layout type, maxcount
+ * and the notification bitmap.
+ *
+ * Only the first bitmap word is kept (in gd_notify_types).  Any further
+ * word must be zero, otherwise the request fails with nfserr_inval.
+ */
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	DECODE_HEAD;
+	u32 num, i;
+
+	READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+	COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+	gdev->gd_layout_type = be32_to_cpup(p++);
+	gdev->gd_maxcount = be32_to_cpup(p++);
+	num = be32_to_cpup(p++);
+	if (num) {
+		/*
+		 * 'num' comes straight off the wire.  Bound it before it is
+		 * multiplied, so that "4 * num" cannot wrap around in u32
+		 * and make READ_BUF() validate far fewer bytes than the
+		 * loop below goes on to read.
+		 */
+		if (num > 1000)
+			goto xdr_error;
+		READ_BUF(4 * num);
+		gdev->gd_notify_types = be32_to_cpup(p++);
+		for (i = 1; i < num; i++) {
+			if (be32_to_cpup(p++)) {
+				status = nfserr_inval;
+				goto out;
+			}
+		}
+	} else {
+		/*
+		 * Empty bitmap: no notifications requested.  Set this
+		 * explicitly rather than relying on the argument structure
+		 * having been zeroed by the caller.
+		 */
+		gdev->gd_notify_types = 0;
+	}
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTGET arguments: signal flag, layout type, the requested
+ * segment (iomode, offset, length), the minimum length, the stateid and
+ * maxcount.
+ */
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutget *lgp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(36);
+	lgp->lg_signal = be32_to_cpup(p++);
+	lgp->lg_layout_type = be32_to_cpup(p++);
+	lgp->lg_seg.iomode = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+	p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+	p = xdr_decode_hyper(p, &lgp->lg_minlength);
+
+	/* A stateid that failed to decode must fail the whole operation. */
+	status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+	if (status)
+		return status;
+
+	READ_BUF(4);
+	lgp->lg_maxcount = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTCOMMIT arguments.
+ *
+ * The optional last-write offset defaults to 0 when absent, and an
+ * absent modification time is represented by tv_nsec == UTIME_NOW.  The
+ * layout-type specific update is not interpreted here: it is saved in
+ * XDR form in lc_up_layout / lc_up_len for the layout driver.
+ */
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutcommit *lcp)
+{
+	DECODE_HEAD;
+	u32 timechange;
+
+	READ_BUF(20);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+	p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+	lcp->lc_reclaim = be32_to_cpup(p++);
+
+	/* Do not carry on with a stateid that failed to decode. */
+	status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+	if (status)
+		return status;
+
+	READ_BUF(4);
+	lcp->lc_newoffset = be32_to_cpup(p++);
+	if (lcp->lc_newoffset) {
+		READ_BUF(8);
+		p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+	} else
+		lcp->lc_last_wr = 0;
+	READ_BUF(4);
+	timechange = be32_to_cpup(p++);
+	if (timechange) {
+		status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+		if (status)
+			return status;
+	} else {
+		lcp->lc_mtime.tv_nsec = UTIME_NOW;
+	}
+	READ_BUF(8);
+	lcp->lc_layout_type = be32_to_cpup(p++);
+
+	/*
+	 * Save the layout update in XDR format and let the layout driver deal
+	 * with it later.
+	 */
+	lcp->lc_up_len = be32_to_cpup(p++);
+	if (lcp->lc_up_len > 0) {
+		READ_BUF(lcp->lc_up_len);
+		READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+	}
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTRETURN arguments.
+ *
+ * For RETURN_FILE the segment, stateid and opaque layout-type body are
+ * read from the request.  For every other return type the segment is
+ * set to cover the whole file (offset 0, length NFS4_MAX_UINT64).
+ */
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+		struct nfsd4_layoutreturn *lrp)
+{
+	DECODE_HEAD;
+
+	READ_BUF(16);
+	lrp->lr_reclaim = be32_to_cpup(p++);
+	lrp->lr_layout_type = be32_to_cpup(p++);
+	lrp->lr_seg.iomode = be32_to_cpup(p++);
+	lrp->lr_return_type = be32_to_cpup(p++);
+	if (lrp->lr_return_type == RETURN_FILE) {
+		READ_BUF(16);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+		p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+
+		/* Do not carry on with a stateid that failed to decode. */
+		status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+		if (status)
+			return status;
+
+		READ_BUF(4);
+		lrp->lrf_body_len = be32_to_cpup(p++);
+		if (lrp->lrf_body_len > 0) {
+			READ_BUF(lrp->lrf_body_len);
+			READMEM(lrp->lrf_body, lrp->lrf_body_len);
+		}
+	} else {
+		lrp->lr_seg.offset = 0;
+		lrp->lr_seg.length = NFS4_MAX_UINT64;
+	}
+
+	DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static __be32
nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
[OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
[OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
[OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
[OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
get_parent_attributes(exp, &stat);
p = xdr_encode_hyper(p, stat.ino);
}
+#ifdef CONFIG_NFSD_PNFS
+ if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+ (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+ if (exp->ex_layout_type) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(exp->ex_layout_type);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(0);
+ }
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(stat.blksize);
+ }
+#endif /* CONFIG_NFSD_PNFS */
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
status = nfsd4_encode_security_label(xdr, rqstp, context,
contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
if (entry_bytes > cd->rd_maxcount)
goto fail;
cd->rd_maxcount -= entry_bytes;
- if (!cd->rd_dircount)
- goto fail;
/*
* RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
* let's always let through the first entry, at least:
*/
- name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+ if (!cd->rd_dircount)
+ goto fail;
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
goto fail;
cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+
cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr;
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Encode the GETDEVICEINFO reply: the layout type, the layout-driver
+ * specific device description (skipped when gd_maxcount is 0) and the
+ * notification bitmap.
+ *
+ * If the driver's encoder fails and the reply would not fit into
+ * gd_maxcount, everything encoded so far is dropped and the required
+ * size is returned together with nfserr_toosmall.
+ *
+ * gd_device is freed on every path out of this function.
+ */
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_getdeviceinfo *gdev)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops;
+	u32 starting_len = xdr->buf->len, needed_len;
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(gdev->gd_layout_type);
+
+	/* If maxcount is 0 then just update notifications */
+	if (gdev->gd_maxcount != 0) {
+		/*
+		 * Look the driver up only here, after the incoming error
+		 * has been checked: gd_layout_type is client supplied and
+		 * must not be used as an array index for a request that
+		 * has already failed.
+		 */
+		ops = nfsd4_layout_ops[gdev->gd_layout_type];
+		nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+		if (nfserr) {
+			/*
+			 * We don't bother to burden the layout drivers with
+			 * enforcing gd_maxcount, just tell the client to
+			 * come back with a bigger buffer if it's not enough.
+			 */
+			if (xdr->buf->len + 4 > gdev->gd_maxcount)
+				goto toosmall;
+			goto out;
+		}
+	}
+
+	nfserr = nfserr_resource;
+	if (gdev->gd_notify_types) {
+		p = xdr_reserve_space(xdr, 4 + 4);
+		if (!p)
+			goto out;
+		*p++ = cpu_to_be32(1);			/* bitmap length */
+		*p++ = cpu_to_be32(gdev->gd_notify_types);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		if (!p)
+			goto out;
+		*p++ = 0;				/* empty bitmap */
+	}
+
+	nfserr = 0;
+out:
+	kfree(gdev->gd_device);
+	dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+	return nfserr;
+
+toosmall:
+	dprintk("%s: maxcount too small\n", __func__);
+	needed_len = xdr->buf->len + 4 /* notifications */;
+	/* Throw away the partial reply and send only the required size. */
+	xdr_truncate_encode(xdr, starting_len);
+	p = xdr_reserve_space(xdr, 4);
+	if (!p) {
+		nfserr = nfserr_resource;
+	} else {
+		*p++ = cpu_to_be32(needed_len);
+		nfserr = nfserr_toosmall;
+	}
+	goto out;
+}
+
+/*
+ * Encode the LAYOUTGET reply: the return-on-close flag, the layout
+ * stateid and exactly one layout segment, followed by the layout-driver
+ * specific body.
+ *
+ * lg_content is freed on every path out of this function.
+ */
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	const struct nfsd4_layout_ops *ops;
+	__be32 *p;
+
+	dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
+	if (nfserr)
+		goto out;
+
+	nfserr = nfserr_resource;
+	p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+	if (!p)
+		goto out;
+
+	*p++ = cpu_to_be32(1);	/* we always set return-on-close */
+	*p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+	p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+				    sizeof(stateid_opaque_t));
+
+	*p++ = cpu_to_be32(1);	/* we always return a single layout */
+	p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+	p = xdr_encode_hyper(p, lgp->lg_seg.length);
+	*p++ = cpu_to_be32(lgp->lg_seg.iomode);
+	*p++ = cpu_to_be32(lgp->lg_layout_type);
+
+	/*
+	 * Look the driver up only now, after the incoming error has been
+	 * checked: lg_layout_type is client supplied and must not be used
+	 * as an array index for a request that has already failed.
+	 */
+	ops = nfsd4_layout_ops[lgp->lg_layout_type];
+	nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+	kfree(lgp->lg_content);
+	return nfserr;
+}
+
+/*
+ * Encode the LAYOUTCOMMIT reply: a "size changed" boolean which, when
+ * set, is followed by the new 64-bit file size.
+ */
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p = cpu_to_be32(lcp->lc_size_chg);
+
+	/* Nothing more to send unless the size actually changed. */
+	if (!lcp->lc_size_chg)
+		return nfs_ok;
+
+	p = xdr_reserve_space(xdr, 8);
+	if (!p)
+		return nfserr_resource;
+	xdr_encode_hyper(p, lcp->lc_newsize);
+
+	return nfs_ok;
+}
+
+/*
+ * Encode the LAYOUTRETURN reply: a "stateid present" boolean which, when
+ * set, is followed by the layout stateid.
+ */
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+		struct nfsd4_layoutreturn *lrp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(lrp->lrs_present);
+	if (lrp->lrs_present) {
+		/*
+		 * Propagate the encoder's status: reporting nfs_ok after
+		 * the stateid failed to encode would send a reply that
+		 * promises a stateid it does not contain.
+		 */
+		return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+	}
+	return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
[OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+#endif
[OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
[OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
#include "cache.h"
#include "state.h"
#include "netns.h"
+#include "pnfs.h"
/*
* We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
retval = nfsd4_init_slabs();
if (retval)
goto out_unregister_pernet;
- retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
+ retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ if (retval)
+ goto out_exit_pnfs;
nfsd_stat_init(); /* Statistics */
retval = nfsd_reply_cache_init();
if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
out_free_stat:
nfsd_stat_shutdown();
nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+ nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
nfsd_stat_shutdown();
nfsd_lockd_shutdown();
nfsd4_free_slabs();
+ nfsd4_exit_pnfs();
nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1 0
+#define PNFSD_SUPPORTED_ATTRS_WORD2 0
+#endif /* CONFIG_NFSD_PNFS */
+
#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
NFSD4_SUPPORTED_ATTRS_WORD0
#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
- NFSD4_SUPPORTED_ATTRS_WORD1
+ (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+ (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
+ FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+/* 4.2 */
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
#else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..84cae2079d21 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
return fhp;
}
+/*
+ * Two file handles match when they have the same size and their first
+ * fh_size bytes are identical.
+ */
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	return fh1->fh_size == fh2->fh_size &&
+	       memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad,
+		      fh1->fh_size) == 0;
+}
+
+/*
+ * Compare only the fsid part of two file handles.  They match when both
+ * use the same fsid type and the key_len(fsid type) bytes of fsid data
+ * are identical.
+ */
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+		return false;
+	/*
+	 * The "!= 0" applies to the memcmp() result.  It must stay outside
+	 * the argument list: inside it, the length passed to memcmp()
+	 * collapses to the 0/1 result of the comparison and almost none of
+	 * the fsid gets compared.
+	 */
+	if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+		return false;
+	return true;
+}
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
[0] = 1,
[1] = 1,
+ [2] = 1,
};
int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..fedb4d620a81
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,81 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+/*
+ * One entry of the device id map: ties an index ('idx') to an fsid of
+ * the given type.  'fsid' is a flexible array member, so entries are
+ * variable sized.
+ * NOTE(review): presumably 'idx' is what ends up in
+ * nfsd4_deviceid.fsid_idx - confirm against nfsd4_set_deviceid().
+ */
+struct nfsd4_deviceid_map {
+ struct list_head hash;
+ u64 idx;
+ int fsid_type;
+ u32 fsid[];
+};
+
+/*
+ * Per layout type operations.  Drivers are looked up through the
+ * nfsd4_layout_ops[] array, indexed by layout type.
+ *
+ * The encode_* hooks are invoked by the XDR reply encoders
+ * (encode_getdeviceinfo only when the client passed a non-zero
+ * maxcount); they return an nfserr status.
+ * NOTE(review): the proc_* hooks are presumably called from the
+ * operation handlers before encoding - confirm in nfs4proc.c.
+ */
+struct nfsd4_layout_ops {
+ /* NOTE(review): looks like the notification types the driver supports - confirm */
+ u32 notify_types;
+
+ __be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdevp);
+ __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdevp);
+
+ __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *,
+ struct nfsd4_layoutget *lgp);
+
+ __be32 (*proc_layoutcommit)(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+extern const struct nfsd4_layout_ops bl_layout_ops;
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+ struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+ u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+ return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
unsigned char sc_type;
stateid_t sc_stateid;
struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
struct list_head cl_delegations;
struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
struct list_head cl_lru; /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head cl_lo_states; /* outstanding layout states */
+#endif
struct xdr_netobj cl_name; /* id generated by client */
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
atomic_t fi_access[2];
u32 fi_share_deny;
struct file *fi_deleg_file;
- atomic_t fi_delegees;
+ int fi_delegees;
struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head fi_lo_states;
+ atomic_t fi_lo_recalls;
+#endif
};
/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
return container_of(s, struct nfs4_ol_stateid, st_stid);
}
+/*
+ * State kept for a layout stateid.  The generic stateid is embedded as
+ * 'ls_stid'; layoutstateid() converts a pointer to it back to this
+ * structure.
+ * NOTE(review): ls_perclnt / ls_perfile presumably link into
+ * nfs4_client.cl_lo_states and nfs4_file.fi_lo_states - confirm.
+ */
+struct nfs4_layout_stateid {
+ struct nfs4_stid ls_stid;
+ struct list_head ls_perclnt;
+ struct list_head ls_perfile;
+ spinlock_t ls_lock;
+ struct list_head ls_layouts;
+ u32 ls_layout_type;
+ struct file *ls_file;
+ struct nfsd4_callback ls_recall;
+ stateid_t ls_recall_sid;
+ bool ls_recalled;
+};
+
+/* Map an embedded generic stateid back to its enclosing layout stateid. */
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+	struct nfs4_layout_stateid *ls;
+
+	ls = container_of(s, struct nfs4_layout_stateid, ls_stid);
+	return ls;
+}
+
/* flags for preprocess_seqid_op() */
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_LAYOUT,
NFSPROC4_CLNT_CB_SEQUENCE,
};
@@ -545,6 +572,12 @@ struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct net *net,
struct nfsd4_compound_state *cstate,
stateid_t *stateid, int flags, struct file **filp);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+ struct kmem_cache *slab);
+void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+/* Take an additional reference on @fi by incrementing its fi_ref count. */
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+ atomic_inc(&fi->fi_ref);
+}
+struct file *find_any_file(struct nfs4_file *f);
+
/* grace period management */
void nfsd4_end_grace(struct nfsd_net *nn);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
+
+#include "state.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Event class shared by all stateid tracepoints.  Each event takes a
+ * stateid_t pointer and records the client id (boot time and id) together
+ * with the stateid's id and generation, printed as
+ * "client <boot>:<id> stateid <id>:<generation>" in hex.
+ */
+DECLARE_EVENT_CLASS(nfsd_stateid_class,
+ TP_PROTO(stateid_t *stp),
+ TP_ARGS(stp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x",
+ __entry->cl_boot,
+ __entry->cl_id,
+ __entry->si_id,
+ __entry->si_generation)
+)
+
+/* Define a tracepoint called 'name' as an instance of nfsd_stateid_class. */
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, name, \
+ TP_PROTO(stateid_t *stp), \
+ TP_ARGS(stp))
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+
+#endif /* _NFSD_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+/*
+ * pNFS device id.  The GETDEVICEINFO decoder copies
+ * sizeof(struct nfsd4_deviceid) bytes from the request directly into this
+ * structure, so its size and field order are part of what goes over the
+ * wire.
+ */
+struct nfsd4_deviceid {
+ u64 fsid_idx;
+ u32 generation;
+ u32 pad;
+};
+
+/*
+ * A layout segment: an I/O mode plus a byte range (offset and length).
+ * Embedded in the LAYOUTGET, LAYOUTCOMMIT and LAYOUTRETURN arguments.
+ */
+struct nfsd4_layout_seg {
+ u32 iomode;
+ u64 offset;
+ u64 length;
+};
+
+/*
+ * GETDEVICEINFO arguments and results.  gd_device is released with
+ * kfree() by the reply encoder.
+ */
+struct nfsd4_getdeviceinfo {
+ struct nfsd4_deviceid gd_devid; /* request */
+ u32 gd_layout_type; /* request */
+ u32 gd_maxcount; /* request */
+ u32 gd_notify_types;/* request - response */
+ void *gd_device; /* response */
+};
+
+/*
+ * LAYOUTGET arguments and results.  lg_content is released with kfree()
+ * by the reply encoder.
+ */
+struct nfsd4_layoutget {
+ u64 lg_minlength; /* request */
+ u32 lg_signal; /* request */
+ u32 lg_layout_type; /* request */
+ u32 lg_maxcount; /* request */
+ stateid_t lg_sid; /* request/response */
+ struct nfsd4_layout_seg lg_seg; /* request/response */
+ void *lg_content; /* response */
+};
+
+/*
+ * LAYOUTCOMMIT arguments and results.
+ *
+ * The decoder sets lc_last_wr to 0 when the client sent no new offset,
+ * and sets lc_mtime.tv_nsec to UTIME_NOW when the client sent no
+ * modification time.  lc_up_layout / lc_up_len hold the layout update
+ * still in XDR form.
+ */
+struct nfsd4_layoutcommit {
+ stateid_t lc_sid; /* request */
+ struct nfsd4_layout_seg lc_seg; /* request */
+ u32 lc_reclaim; /* request */
+ u32 lc_newoffset; /* request */
+ u64 lc_last_wr; /* request */
+ struct timespec lc_mtime; /* request */
+ u32 lc_layout_type; /* request */
+ u32 lc_up_len; /* layout length */
+ void *lc_up_layout; /* decoded by callback */
+ u32 lc_size_chg; /* boolean for response */
+ u64 lc_newsize; /* response */
+};
+
+/*
+ * LAYOUTRETURN arguments and results.
+ *
+ * lr_seg.offset/length, lr_sid and the lrf_body fields are only read
+ * from the request when lr_return_type is RETURN_FILE; for other return
+ * types the decoder sets the segment to offset 0, length
+ * NFS4_MAX_UINT64.  lrs_present tells the encoder whether lr_sid is
+ * included in the reply.
+ */
+struct nfsd4_layoutreturn {
+ u32 lr_return_type; /* request */
+ u32 lr_layout_type; /* request */
+ struct nfsd4_layout_seg lr_seg; /* request */
+ u32 lr_reclaim; /* request */
+ u32 lrf_body_len; /* request */
+ void *lrf_body; /* request */
+ stateid_t lr_sid; /* request/response */
+ u32 lrs_present; /* response */
+};
+
struct nfsd4_fallocate {
/* request */
stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
struct nfsd4_reclaim_complete reclaim_complete;
struct nfsd4_test_stateid test_stateid;
struct nfsd4_free_stateid free_stateid;
+ struct nfsd4_getdeviceinfo getdeviceinfo;
+ struct nfsd4_layoutget layoutget;
+ struct nfsd4_layoutcommit layoutcommit;
+ struct nfsd4_layoutreturn layoutreturn;
/* NFSv4.2 */
struct nfsd4_fallocate allocate;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 3 + \
+ enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 41b223a59a63..fa05e04c5531 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
struct dentry;
+struct iattr;
struct inode;
struct super_block;
struct vfsmount;
@@ -180,6 +181,21 @@ struct fid {
* get_name is not (which is possibly inconsistent)
*/
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
+#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
+
+struct iomap {
+ sector_t blkno; /* first sector of mapping */
+ loff_t offset; /* file offset of mapping, bytes */
+ u64 length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+};
+
struct export_operations {
int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
struct inode *parent);
@@ -191,6 +207,13 @@ struct export_operations {
struct dentry *child);
struct dentry * (*get_parent)(struct dentry *child);
int (*commit_metadata)(struct inode *inode);
+
+ int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+ int (*map_blocks)(struct inode *inode, loff_t offset,
+ u64 len, struct iomap *iomap,
+ bool write, u32 *device_generation);
+ int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
+ int nr_iomaps, struct iattr *iattr);
};
extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f125b88443bd..cdcb1e9d9613 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -873,6 +873,7 @@ static inline struct file *get_file(struct file *f)
#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING 512 /* Lease is being broken */
#define FL_OFDLCK 1024 /* lock is "owned" by struct file */
+#define FL_LAYOUT 2048 /* outstanding pNFS layout */
/*
* Special return value from posix_lock_file() and vfs_lock_file() for
@@ -2035,6 +2036,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
return ret;
}
+/*
+ * Break outstanding FL_LAYOUT leases on @inode.
+ *
+ * Returns 0 without calling __break_lease() when the inode has no lock
+ * context or its lease list is empty; otherwise returns the result of
+ * __break_lease().  When @wait is false, O_NONBLOCK is added to the mode
+ * passed to __break_lease().
+ */
+static inline int break_layout(struct inode *inode, bool wait)
+{
+ /* NOTE(review): presumably orders the caller's prior stores against the lease-list check below - confirm */
+ smp_mb();
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+ return __break_lease(inode,
+ wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
+ FL_LAYOUT);
+ return 0;
+}
+
#else /* !CONFIG_FILE_LOCKING */
static inline int locks_mandatory_locked(struct file *file)
{
@@ -2090,6 +2101,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
return 0;
}
+/* Stub for kernels built without CONFIG_FILE_LOCKING: always returns 0. */
+static inline int break_layout(struct inode *inode, bool wait)
+{
+ return 0;
+}
+
#endif /* CONFIG_FILE_LOCKING */
/* fs/open.c */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index de7c91ca427e..ed43cb74b11d 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -411,6 +411,7 @@ enum lock_type4 {
#define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22)
#define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23)
#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30)
+#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0)
#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
#define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4)
#define FATTR4_WORD2_SECURITY_LABEL (1UL << 16)
@@ -517,6 +518,7 @@ enum pnfs_layouttype {
LAYOUT_OSD2_OBJECTS = 2,
LAYOUT_BLOCK_VOLUME = 3,
LAYOUT_FLEX_FILES = 4,
+ LAYOUT_TYPE_MAX
};
/* used for both layout return and recall */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 6f22cfeef5e3..fae6fb947fc8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -110,7 +110,7 @@ struct svc_serv {
* We use sv_nrthreads as a reference count. svc_destroy() drops
* this refcount, so we need to bump it up around operations that
* change the number of threads. Horrible, but there it is.
- * Should be called with the BKL held.
+ * Should be called with the "service mutex" held.
*/
static inline void svc_get(struct svc_serv *serv)
{
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ddfe88f52219..df8edf8ec914 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -77,6 +77,7 @@ struct svc_rdma_op_ctxt {
enum ib_wr_opcode wr_op;
enum ib_wc_status wc_status;
u32 byte_len;
+ u32 position;
struct svcxprt_rdma *xprt;
unsigned long flags;
enum dma_data_direction direction;
@@ -148,6 +149,10 @@ struct svcxprt_rdma {
struct ib_cq *sc_rq_cq;
struct ib_cq *sc_sq_cq;
struct ib_mr *sc_phys_mr; /* MR for server memory */
+ int (*sc_reader)(struct svcxprt_rdma *,
+ struct svc_rqst *,
+ struct svc_rdma_op_ctxt *,
+ int *, u32 *, u32, u32, u64, bool);
u32 sc_dev_caps; /* distilled device caps */
u32 sc_dma_lkey; /* local dma key */
unsigned int sc_frmr_pg_list_len;
@@ -176,8 +181,6 @@ struct svcxprt_rdma {
#define RPCRDMA_MAX_REQ_SIZE 4096
/* svc_rdma_marshal.c */
-extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
- int *, int *);
extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
@@ -195,6 +198,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
/* svc_rdma_recvfrom.c */
extern int svc_rdma_recvfrom(struct svc_rqst *);
+extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *,
+ struct svc_rdma_op_ctxt *, int *, u32 *,
+ u32, u32, u64, bool);
+extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
+ struct svc_rdma_op_ctxt *, int *, u32 *,
+ u32, u32, u64, bool);
/* svc_rdma_sendto.c */
extern int svc_rdma_sendto(struct svc_rqst *);
diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h
index 1fdc95bb2375..0bf130a1c58d 100644
--- a/include/uapi/linux/nfsd/debug.h
+++ b/include/uapi/linux/nfsd/debug.h
@@ -32,6 +32,7 @@
#define NFSDDBG_REPCACHE 0x0080
#define NFSDDBG_XDR 0x0100
#define NFSDDBG_LOCKD 0x0200
+#define NFSDDBG_PNFS 0x0400
#define NFSDDBG_ALL 0x7FFF
#define NFSDDBG_NOCHANGE 0xFFFF
diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h
index 584b6ef3a5e8..4742f2cb42f2 100644
--- a/include/uapi/linux/nfsd/export.h
+++ b/include/uapi/linux/nfsd/export.h
@@ -47,8 +47,10 @@
* exported filesystem.
*/
#define NFSEXP_V4ROOT 0x10000
+#define NFSEXP_NOPNFS 0x20000
+
/* All flags that we claim to support. (Note we don't support NOACL.) */
-#define NFSEXP_ALLFLAGS 0x1FE7F
+#define NFSEXP_ALLFLAGS 0x3FE7F
/* The flags that may vary depending on security flavor: */
#define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 91eaef1844c8..78974e4d9ad2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
EXPORT_SYMBOL_GPL(svc_set_num_threads);
/*
- * Called from a server thread as it's exiting. Caller must hold the BKL or
- * the "service mutex", whichever is appropriate for the service.
+ * Called from a server thread as it's exiting. Caller must hold the "service
+ * mutex" for the service.
*/
void
svc_exit_thread(struct svc_rqst *rqstp)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c69358b3cf7f..163ac45c3639 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list);
* svc_pool->sp_lock protects most of the fields of that pool.
* svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
* when both need to be taken (rare), svc_serv->sv_lock is first.
- * BKL protects svc_serv->sv_nrthread.
+ * The "service mutex" protects svc_serv->sv_nrthreads.
* svc_sock->sk_lock protects the svc_sock->sk_deferred list
* and the ->sk_info_authunix cache.
*
@@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list);
* that no other thread will be using the transport or will
* try to set XPT_DEAD.
*/
-
int svc_reg_xprt_class(struct svc_xprt_class *xcl)
{
struct svc_xprt_class *cl;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 65b146297f5a..b681855cf970 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -71,22 +71,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
}
/*
- * Determine number of chunks and total bytes in chunk list. The chunk
- * list has already been verified to fit within the RPCRDMA header.
- */
-void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
- int *ch_count, int *byte_count)
-{
- /* compute the number of bytes represented by read chunks */
- *byte_count = 0;
- *ch_count = 0;
- for (; ch->rc_discrim != 0; ch++) {
- *byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
- *ch_count = *ch_count + 1;
- }
-}
-
-/*
* Decodes a write chunk list. The expected format is as follows:
* descrim : xdr_one
* nchunks : <count>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e0110270d650..f9f13a32ddb8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -43,7 +43,6 @@
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
-#include <linux/highmem.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -60,6 +59,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *ctxt,
u32 byte_count)
{
+ struct rpcrdma_msg *rmsgp;
struct page *page;
u32 bc;
int sge_no;
@@ -82,7 +82,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* If data remains, store it in the pagelist */
rqstp->rq_arg.page_len = bc;
rqstp->rq_arg.page_base = 0;
- rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
+ /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+ if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+ rqstp->rq_arg.pages = &rqstp->rq_pages[0];
+ else
+ rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
sge_no = 1;
while (bc && sge_no < ctxt->count) {
page = ctxt->pages[sge_no];
@@ -95,14 +102,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_respages = &rqstp->rq_pages[sge_no];
rqstp->rq_next_page = rqstp->rq_respages + 1;
- /* We should never run out of SGE because the limit is defined to
- * support the max allowed RPC data length
- */
- BUG_ON(bc && (sge_no == ctxt->count));
- BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
- != byte_count);
- BUG_ON(rqstp->rq_arg.len != byte_count);
-
/* If not all pages were used from the SGL, free the remaining ones */
bc = sge_no;
while (sge_no < ctxt->count) {
@@ -125,26 +124,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
return min_t(int, sge_count, xprt->sc_max_sge);
}
-typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head,
- int *page_no,
- u32 *page_offset,
- u32 rs_handle,
- u32 rs_length,
- u64 rs_offset,
- int last);
-
/* Issue an RDMA_READ using the local lkey to map the data sink */
-static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head,
- int *page_no,
- u32 *page_offset,
- u32 rs_handle,
- u32 rs_length,
- u64 rs_offset,
- int last)
+int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ bool last)
{
struct ib_send_wr read_wr;
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
@@ -229,15 +218,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
}
/* Issue an RDMA_READ using an FRMR to map the data sink */
-static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head,
- int *page_no,
- u32 *page_offset,
- u32 rs_handle,
- u32 rs_length,
- u64 rs_offset,
- int last)
+int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ bool last)
{
struct ib_send_wr read_wr;
struct ib_send_wr inv_wr;
@@ -365,24 +354,84 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
return ret;
}
+/* Count the entries of a read list: walk @ch forward until the first
+ * entry whose rc_discrim is xdr_zero and return how many were passed.
+ */
+static unsigned int
+rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch)
+{
+	unsigned int nchunks = 0;
+
+	while (ch->rc_discrim != xdr_zero) {
+		nchunks++;
+		ch++;
+	}
+	return nchunks;
+}
+
+/* If there was additional inline content, append it to the end of arg.pages.
+ * Tail copy has to be done after the reader function has determined how many
+ * pages are needed for RDMA READ.
+ *
+ * The tail is the part of head->arg.head[0] that starts @position bytes in.
+ * It is copied to @page_offset within rq_arg.pages[@page_no], spilling onto
+ * the following page when the current one fills up (or is already full).
+ * The incoming @byte_count value is not consulted; the tail length is
+ * recomputed from the head iovec.
+ *
+ * Returns 1 after copying the tail and growing head->arg by its length, or
+ * 0 without copying anything when the tail is larger than PAGE_SIZE.
+ */
+static int
+rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
+	       u32 position, u32 byte_count, u32 page_offset, int page_no)
+{
+	char *srcp, *destp;
+	u32 first = 0;
+
+	srcp = head->arg.head[0].iov_base + position;
+	byte_count = head->arg.head[0].iov_len - position;
+	if (byte_count > PAGE_SIZE) {
+		dprintk("svcrdma: large tail unsupported\n");
+		return 0;
+	}
+
+	/* Fit as much of the tail on the current page as possible */
+	if (page_offset != PAGE_SIZE) {
+		first = min_t(u32, byte_count, PAGE_SIZE - page_offset);
+		destp = page_address(rqstp->rq_arg.pages[page_no]);
+		memcpy(destp + page_offset, srcp, first);
+	}
+
+	/* Fit the rest on the next page; the reply pages then start
+	 * after that page.
+	 */
+	if (page_offset == PAGE_SIZE || first < byte_count) {
+		page_no++;
+		destp = page_address(rqstp->rq_arg.pages[page_no]);
+		memcpy(destp, srcp + first, byte_count - first);
+
+		rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+		rqstp->rq_next_page = rqstp->rq_respages + 1;
+	}
+
+	/* Account for the appended tail in the deferred xdr_buf */
+	head->arg.page_len += byte_count;
+	head->arg.len += byte_count;
+	head->arg.buflen += byte_count;
+	return 1;
+}
+
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head)
{
- int page_no, ch_count, ret;
+ int page_no, ret;
struct rpcrdma_read_chunk *ch;
- u32 page_offset, byte_count;
+ u32 handle, page_offset, byte_count;
+ u32 position;
u64 rs_offset;
- rdma_reader_fn reader;
+ bool last;
/* If no read list is present, return 0 */
ch = svc_rdma_get_read_chunk(rmsgp);
if (!ch)
return 0;
- svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
- if (ch_count > RPCSVC_MAXPAGES)
+ if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES)
return -EINVAL;
/* The request is completed when the RDMA_READs complete. The
@@ -391,34 +440,41 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
*/
head->arg.head[0] = rqstp->rq_arg.head[0];
head->arg.tail[0] = rqstp->rq_arg.tail[0];
- head->arg.pages = &head->pages[head->count];
head->hdr_count = head->count;
head->arg.page_base = 0;
head->arg.page_len = 0;
head->arg.len = rqstp->rq_arg.len;
head->arg.buflen = rqstp->rq_arg.buflen;
- /* Use FRMR if supported */
- if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
- reader = rdma_read_chunk_frmr;
- else
- reader = rdma_read_chunk_lcl;
+ ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ position = be32_to_cpu(ch->rc_position);
+
+ /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ if (position == 0) {
+ head->arg.pages = &head->pages[0];
+ page_offset = head->byte_len;
+ } else {
+ head->arg.pages = &head->pages[head->count];
+ page_offset = 0;
+ }
- page_no = 0; page_offset = 0;
- for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- ch->rc_discrim != 0; ch++) {
+ ret = 0;
+ page_no = 0;
+ for (; ch->rc_discrim != xdr_zero; ch++) {
+ if (be32_to_cpu(ch->rc_position) != position)
+ goto err;
+ handle = be32_to_cpu(ch->rc_target.rs_handle);
+ byte_count = be32_to_cpu(ch->rc_target.rs_length);
xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
&rs_offset);
- byte_count = ntohl(ch->rc_target.rs_length);
while (byte_count > 0) {
- ret = reader(xprt, rqstp, head,
- &page_no, &page_offset,
- ntohl(ch->rc_target.rs_handle),
- byte_count, rs_offset,
- ((ch+1)->rc_discrim == 0) /* last */
- );
+ last = (ch + 1)->rc_discrim == xdr_zero;
+ ret = xprt->sc_reader(xprt, rqstp, head,
+ &page_no, &page_offset,
+ handle, byte_count,
+ rs_offset, last);
if (ret < 0)
goto err;
byte_count -= ret;
@@ -426,7 +482,24 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
head->arg.buflen += ret;
}
}
+
+ /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */
+ if (page_offset & 3) {
+ u32 pad = 4 - (page_offset & 3);
+
+ head->arg.page_len += pad;
+ head->arg.len += pad;
+ head->arg.buflen += pad;
+ page_offset += pad;
+ }
+
ret = 1;
+ if (position && position < head->arg.head[0].iov_len)
+ ret = rdma_copy_tail(rqstp, head, position,
+ byte_count, page_offset, page_no);
+ head->arg.head[0].iov_len = position;
+ head->position = position;
+
err:
/* Detach arg pages. svc_recv will replenish them */
for (page_no = 0;
@@ -436,47 +509,33 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
return ret;
}
-/*
- * To avoid a separate RDMA READ just for a handful of zero bytes,
- * RFC 5666 section 3.7 allows the client to omit the XDR zero pad
- * in chunk lists.
- */
-static void
-rdma_fix_xdr_pad(struct xdr_buf *buf)
-{
- unsigned int page_len = buf->page_len;
- unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
- unsigned int offset, pg_no;
- char *p;
-
- if (size == 0)
- return;
-
- pg_no = page_len >> PAGE_SHIFT;
- offset = page_len & ~PAGE_MASK;
- p = page_address(buf->pages[pg_no]);
- memset(p + offset, 0, size);
-
- buf->page_len += size;
- buf->buflen += size;
- buf->len += size;
-}
-
static int rdma_read_complete(struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head)
{
int page_no;
int ret;
- BUG_ON(!head);
-
/* Copy RPC pages */
for (page_no = 0; page_no < head->count; page_no++) {
put_page(rqstp->rq_pages[page_no]);
rqstp->rq_pages[page_no] = head->pages[page_no];
}
+
+ /* Adjustments made for RDMA_NOMSG type requests */
+ if (head->position == 0) {
+ if (head->arg.len <= head->sge[0].length) {
+ head->arg.head[0].iov_len = head->arg.len -
+ head->byte_len;
+ head->arg.page_len = 0;
+ } else {
+ head->arg.head[0].iov_len = head->sge[0].length -
+ head->byte_len;
+ head->arg.page_len = head->arg.len -
+ head->sge[0].length;
+ }
+ }
+
/* Point rq_arg.pages past header */
- rdma_fix_xdr_pad(&head->arg);
rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
rqstp->rq_arg.page_len = head->arg.page_len;
rqstp->rq_arg.page_base = head->arg.page_base;
@@ -501,8 +560,8 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
+ rqstp->rq_arg.tail[0].iov_len;
- dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
- "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+ dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
rqstp->rq_arg.head[0].iov_len);
@@ -558,7 +617,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
}
dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
ctxt, rdma_xprt, rqstp, ctxt->wc_status);
- BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
atomic_inc(&rdma_stat_recv);
/* Build up the XDR from the receive buffers. */
@@ -591,8 +649,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+ rqstp->rq_arg.tail[0].iov_len;
svc_rdma_put_context(ctxt, 0);
out:
- dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
- "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+ dprintk("svcrdma: ret=%d, rq_arg.len=%u, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
ret, rqstp->rq_arg.len,
rqstp->rq_arg.head[0].iov_base,
rqstp->rq_arg.head[0].iov_len);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 9f1b50689c0f..7de33d1af9b6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -60,8 +60,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
u32 page_off;
int page_no;
- BUG_ON(xdr->len !=
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+ if (xdr->len !=
+ (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
+ pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+ return -EIO;
+ }
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
@@ -150,7 +153,11 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
int bc;
struct svc_rdma_op_ctxt *ctxt;
- BUG_ON(vec->count > RPCSVC_MAXPAGES);
+ if (vec->count > RPCSVC_MAXPAGES) {
+ pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+ return -EIO;
+ }
+
dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
"write_len=%d, vec->sge=%p, vec->count=%lu\n",
rmr, (unsigned long long)to, xdr_off,
@@ -190,7 +197,10 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_off = 0;
sge_no++;
xdr_sge_no++;
- BUG_ON(xdr_sge_no > vec->count);
+ if (xdr_sge_no > vec->count) {
+ pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
+ goto err;
+ }
bc -= sge_bytes;
if (sge_no == xprt->sc_max_sge)
break;
@@ -421,7 +431,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
- BUG_ON(byte_count != 0);
+ if (byte_count != 0) {
+ pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+ goto err;
+ }
/* Save all respages in the ctxt and remove them from the
* respages array. They are our pages until the I/O
@@ -442,7 +455,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
}
rqstp->rq_next_page = rqstp->rq_respages + 1;
- BUG_ON(sge_no > rdma->sc_max_sge);
+ if (sge_no > rdma->sc_max_sge) {
+ pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ goto err;
+ }
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
send_wr.wr_id = (unsigned long)ctxt;
@@ -467,18 +483,6 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}
-/*
- * Return the start of an xdr buffer.
- */
-static void *xdr_start(struct xdr_buf *xdr)
-{
- return xdr->head[0].iov_base -
- (xdr->len -
- xdr->page_len -
- xdr->tail[0].iov_len -
- xdr->head[0].iov_len);
-}
-
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
@@ -496,8 +500,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
- /* Get the RDMA request header. */
- rdma_argp = xdr_start(&rqstp->rq_arg);
+ /* Get the RDMA request header. The receive logic always
+ * places this at the start of page 0.
+ */
+ rdma_argp = page_address(rqstp->rq_pages[0]);
/* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4e618808bc98..f609c1c2d38d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -139,7 +139,6 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
struct svcxprt_rdma *xprt;
int i;
- BUG_ON(!ctxt);
xprt = ctxt->xprt;
if (free_pages)
for (i = 0; i < ctxt->count; i++)
@@ -339,12 +338,14 @@ static void process_context(struct svcxprt_rdma *xprt,
switch (ctxt->wr_op) {
case IB_WR_SEND:
- BUG_ON(ctxt->frmr);
+ if (ctxt->frmr)
+ pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
svc_rdma_put_context(ctxt, 1);
break;
case IB_WR_RDMA_WRITE:
- BUG_ON(ctxt->frmr);
+ if (ctxt->frmr)
+ pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
svc_rdma_put_context(ctxt, 0);
break;
@@ -353,19 +354,21 @@ static void process_context(struct svcxprt_rdma *xprt,
svc_rdma_put_frmr(xprt, ctxt->frmr);
if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- BUG_ON(!read_hdr);
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ if (read_hdr) {
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ } else {
+ pr_err("svcrdma: ctxt->read_hdr == NULL\n");
+ }
svc_xprt_enqueue(&xprt->sc_xprt);
}
svc_rdma_put_context(ctxt, 0);
break;
default:
- BUG_ON(1);
printk(KERN_ERR "svcrdma: unexpected completion type, "
"opcode=%d\n",
ctxt->wr_op);
@@ -513,7 +516,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
buflen = 0;
ctxt->direction = DMA_FROM_DEVICE;
for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
- BUG_ON(sge_no >= xprt->sc_max_sge);
+ if (sge_no >= xprt->sc_max_sge) {
+ pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ goto err_put_ctxt;
+ }
page = svc_rdma_get_page();
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
@@ -687,7 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
{
struct rdma_cm_id *listen_id;
struct svcxprt_rdma *cma_xprt;
- struct svc_xprt *xprt;
int ret;
dprintk("svcrdma: Creating RDMA socket\n");
@@ -698,7 +703,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
cma_xprt = rdma_create_xprt(serv, 1);
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
- xprt = &cma_xprt->sc_xprt;
listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
IB_QPT_RC);
@@ -822,7 +826,7 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
frmr_unmap_dma(rdma, frmr);
spin_lock_bh(&rdma->sc_frmr_q_lock);
- BUG_ON(!list_empty(&frmr->frmr_list));
+ WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
spin_unlock_bh(&rdma->sc_frmr_q_lock);
}
@@ -970,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* NB: iWARP requires remote write access for the data sink
* of an RDMA_READ. IB does not.
*/
+ newxprt->sc_reader = rdma_read_chunk_lcl;
if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
newxprt->sc_frmr_pg_list_len =
devattr.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+ newxprt->sc_reader = rdma_read_chunk_frmr;
}
/*
@@ -1125,7 +1131,9 @@ static void __svc_rdma_free(struct work_struct *work)
dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
/* We should only be called from kref_put */
- BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+ if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+ pr_err("svcrdma: sc_xprt still in use? (%d)\n",
+ atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
/*
* Destroy queued, but not processed read completions. Note
@@ -1153,8 +1161,12 @@ static void __svc_rdma_free(struct work_struct *work)
}
/* Warn if we leaked a resource or under-referenced */
- WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
- WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+ if (atomic_read(&rdma->sc_ctxt_used) != 0)
+ pr_err("svcrdma: ctxt still in use? (%d)\n",
+ atomic_read(&rdma->sc_ctxt_used));
+ if (atomic_read(&rdma->sc_dma_used) != 0)
+ pr_err("svcrdma: dma still in use? (%d)\n",
+ atomic_read(&rdma->sc_dma_used));
/* De-allocate fastreg mr */
rdma_dealloc_frmr_q(rdma);
@@ -1254,7 +1266,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
return -ENOTCONN;
- BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
wr_count = 1;
for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
wr_count++;