diff options
Diffstat (limited to 'fs')
44 files changed, 1379 insertions, 779 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index fe3f59c14a02..333a7bb4cb9c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,6 +432,9 @@ static void init_once(void *foo) mutex_init(&bdev->bd_mutex); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -779,6 +782,23 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } #ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + static int add_symlink(struct kobject *from, struct kobject *to) { return sysfs_create_link(from, to, kobject_name(to)); @@ -794,6 +814,8 @@ static void del_symlink(struct kobject *from, struct kobject *to) * @bdev: the claimed slave bdev * @disk: the holding disk * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * * This functions creates the following sysfs symlinks. * * - from "slaves" directory of the holder @disk to the claimed @bdev @@ -817,47 +839,83 @@ static void del_symlink(struct kobject *from, struct kobject *to) */ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { + struct bd_holder_disk *holder; int ret = 0; mutex_lock(&bdev->bd_mutex); - WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); + WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) goto out_unlock; - ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - if (ret) + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; goto out_unlock; + } - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); - if (ret) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; goto out_unlock; } - bdev->bd_holder_disk = disk; + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); out_unlock: mutex_unlock(&bdev->bd_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -static void bd_unlink_disk_holder(struct block_device *bdev) +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { - struct gendisk *disk = bdev->bd_holder_disk; + struct bd_holder_disk *holder; - bdev->bd_holder_disk = NULL; - if (!disk) - return; + mutex_lock(&bdev->bd_mutex); - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); } -#else -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ } +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** @@ -1380,7 +1438,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * unblock evpoll if it was a write holder. */ if (bdev_free) { - bd_unlink_disk_holder(bdev); if (bdev->bd_write_holder) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; diff --git a/fs/dcache.c b/fs/dcache.c index 51f7bb6463af..9f493ee4dcba 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1357,8 +1357,8 @@ EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { - BUG_ON(dentry->d_op); - BUG_ON(dentry->d_flags & (DCACHE_OP_HASH | + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE )); diff --git a/fs/locks.c b/fs/locks.c index 08415b2a6d36..0f3998291f78 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl) fl->fl_file->f_owner.signum = 0; } -static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) -{ - return fl->fl_file == try->fl_file; -} - static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, - .fl_mylease = lease_mylease_callback, .fl_change = lease_modify, }; @@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (lease->fl_lmops->fl_mylease(fl, lease)) + if (fl->fl_file == filp) my_before = before; else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) /* diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 000000000000..34e5c40af5ef --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include <linux/posix_acl.h> + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c0fcb7ab7f6d..8b31e5f8795d 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * NFS exporting and validation. * @@ -1444,9 +1443,6 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, -#ifdef MSNFS - { NFSEXP_MSNFS, {"msnfs", ""}}, -#endif { 0, {"", ""}} }; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 000000000000..2f3be1321534 --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include <linux/in.h> +#include <linux/sunrpc/svc.h> + +/* XXX from linux/nfs_idmap.h */ +#define IDMAP_NAMESZ 128 + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(void); +void nfsd_idmap_shutdown(void); +#else +static inline int nfsd_idmap_init(void) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(void) +{ +} +#endif + +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); +int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 5b7e3021e06b..2247fc91d5e9 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); - dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, - (unsigned long) argp->offset); + (unsigned long long) argp->offset); /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, __be32 nfserr; unsigned long cnt = argp->len; - dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), argp->len, - (unsigned long) argp->offset, + (unsigned long long) argp->offset, argp->stable? " stable" : ""); fh_copy(&resp->fh, &argp->fh); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index e48052615159..ad88f1c0a4c3 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/nfs4_acl.h> +#include "acl.h" /* mode bit translations: */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 21a63da305ff..3be975e18919 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -628,10 +628,8 @@ static int max_cb_time(void) return max(nfsd4_lease/10, (time_t)1) * HZ; } -/* Reference counting, callback cleanup, etc., all look racy as heck. - * And why is cl_cb_set an atomic? */ -int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { struct rpc_timeout timeparms = { .to_initval = max_cb_time(), @@ -641,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) .net = &init_net, .address = (struct sockaddr *) &conn->cb_addr, .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, .program = &cb_program, .version = 0, @@ -657,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) args.protocol = XPRT_TRANSPORT_TCP; clp->cl_cb_ident = conn->cb_ident; } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; @@ -679,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) (int)clp->cl_name.len, clp->cl_name.data, reason); } +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - warn_no_callback_path(clp, task->tk_status); + nfsd4_mark_cb_down(clp, task->tk_status); else - atomic_set(&clp->cl_cb_set, 1); + clp->cl_cb_state = NFSD4_CB_UP; } static const struct rpc_call_ops nfsd4_cb_probe_ops = { @@ -709,6 +718,11 @@ int set_callback_cred(void) static struct workqueue_struct *callback_wq; +static void run_nfsd4_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} + static void do_probe_callback(struct nfs4_client *clp) { struct nfsd4_callback *cb = &clp->cl_cb_null; @@ -723,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp) cb->cb_ops = &nfsd4_cb_probe_ops; - queue_work(callback_wq, &cb->cb_work); + run_nfsd4_cb(cb); } /* @@ -732,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { + /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); do_probe_callback(clp); } -void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +void nfsd4_probe_callback_sync(struct nfs4_client *clp) { - BUG_ON(atomic_read(&clp->cl_cb_set)); + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); @@ -750,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. */ -static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, - struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) { - u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data; - int status = 0; - - dprintk("%s: %u:%u:%u:%u\n", __func__, - ptr[0], ptr[1], ptr[2], ptr[3]); - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); dprintk("%s slot is busy\n", __func__); - status = -EAGAIN; - goto out; + return false; } -out: - dprintk("%s status=%d\n", __func__, status); - return status; + return true; } /* @@ -780,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; u32 minorversion = clp->cl_minorversion; - int status = 0; cb->cb_minorversion = minorversion; if (minorversion) { - status = nfsd41_cb_setup_sequence(clp, task); - if (status) { - if (status != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = status; - task->tk_action = NULL; - } + if (!nfsd41_cb_get_slot(clp, task)) return; - } } + spin_lock(&clp->cl_lock); + if (list_empty(&cb->cb_per_client)) { + /* This is the first call, not a restart */ + cb->cb_done = false; + list_add(&cb->cb_per_client, &clp->cl_callbacks); + } + spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -829,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) nfsd4_cb_done(task, calldata); - if (current_rpc_client == NULL) { - /* We're shutting down; give up. */ - /* XXX: err, or is it ok just to fall through - * and rpc_restart_call? */ + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave + * it to nfsd4_process_cb_update to restart the call if + * necessary. */ return; } + if (cb->cb_done) + return; switch (task->tk_status) { case 0: + cb->cb_done = true; return; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: @@ -846,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* Network partition? */ - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); - if (current_rpc_client != task->tk_client) { - /* queue a callback on the new connection: */ - atomic_inc(&dp->dl_count); - nfsd4_cb_recall(dp); - return; - } + nfsd4_mark_cb_down(clp, task->tk_status); } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; rpc_restart_call_prepare(task); return; - } else { - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); } + nfsd4_mark_cb_down(clp, task->tk_status); + cb->cb_done = true; } static void nfsd4_cb_recall_release(void *calldata) { struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - nfs4_put_delegation(dp); + if (cb->cb_done) { + spin_lock(&clp->cl_lock); + list_del(&cb->cb_per_client); + spin_unlock(&clp->cl_lock); + nfs4_put_delegation(dp); + } } static const struct rpc_call_ops nfsd4_cb_recall_ops = { @@ -906,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) flush_workqueue(callback_wq); } -void nfsd4_release_cb(struct nfsd4_callback *cb) +static void nfsd4_release_cb(struct nfsd4_callback *cb) { if (cb->cb_ops->rpc_release) cb->cb_ops->rpc_release(cb); } -void nfsd4_process_cb_update(struct nfsd4_callback *cb) +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; int err; /* @@ -926,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) return; spin_lock(&clp->cl_lock); @@ -936,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) BUG_ON(!clp->cl_cb_flags); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } spin_unlock(&clp->cl_lock); - err = setup_callback_client(clp, &conn); - if (err) + err = setup_callback_client(clp, &conn, ses); + if (err) { warn_no_callback_path(clp, err); + return; + } + /* Yay, the callback channel's back! Restart any callbacks: */ + list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) + run_nfsd4_cb(cb); } void nfsd4_do_callback_rpc(struct work_struct *w) @@ -965,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; + struct nfs4_client *clp = dp->dl_client; dp->dl_retries = 1; cb->cb_op = dp; - cb->cb_clp = dp->dl_client; + cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; @@ -977,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) cb->cb_ops = &nfsd4_cb_recall_ops; dp->dl_retries = 1; - queue_work(callback_wq, &dp->dl_recall.cb_work); + INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_done = true; + + run_nfsd4_cb(&dp->dl_recall); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index f0695e815f0e..6d2c397d458b 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -33,10 +33,11 @@ */ #include <linux/module.h> -#include <linux/nfsd_idmap.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> +#include "idmap.h" +#include "nfsd.h" /* * Cache entry @@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp) return clp->name; } -static int +static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) { @@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen int ret; if (namelen + 1 > sizeof(key.name)) - return -EINVAL; + return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); if (ret == -ENOENT) - ret = -ESRCH; /* nfserr_badname */ + return nfserr_badowner; if (ret) - return ret; + return nfserrno(ret); *id = item->id; cache_put(&item->h, &nametoid_cache); return 0; @@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) return ret; } -int +__be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); } -int +__be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0cdfd022bb7b..db52546143d1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } -static __be32 -nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - void *arg) +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) { struct svc_fh tmp_fh; __be32 ret; @@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = exp_pseudoroot(rqstp, &tmp_fh); if (ret) return ret; - if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { + if (tmp_fh.fh_dentry == fh->fh_dentry) { fh_put(&tmp_fh); return nfserr_noent; } fh_put(&tmp_fh); - return nfsd_lookup(rqstp, &cstate->current_fh, - "..", 2, &cstate->current_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); } static __be32 @@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else secinfo->si_exp = exp; dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); return err; } static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) { @@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum); * Also note, enforced elsewhere: * - SEQUENCE other than as first op results in * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) - * - BIND_CONN_TO_SESSION must be the only op in its compound - * (Will be enforced in nfsd4_bind_conn_to_session().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) * - DESTROY_SESSION must be the final operation in a compound, if * sessionid's in SEQUENCE and DESTROY_SESSION are the same. * (Enforced in nfsd4_destroy_session().) @@ -1126,10 +1156,6 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - if (!rqstp->rq_usedeferral && status == nfserr_dropit) { - dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); - status = nfserr_jukebox; - } resp->cstate.status = status; fh_put(&resp->cstate.current_fh); @@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_EXCHANGE_ID", }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, @@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_RECLAIM_COMPLETE", }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7e26caab2a26..ffb59ef6f82f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - /* note: we currently use this path only for minorversion 0 */ if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fbd18c3074bb..d98d0213285d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; - nfs4_file_get_access(fp, O_RDONLY); + dp->dl_vfs_file = find_readable_file(fp); + get_file(dp->dl_vfs_file); dp->dl_flock = NULL; dp->dl_type = type; dp->dl_stateid.si_boot = boot_time; @@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); + fput(dp->dl_vfs_file); kmem_cache_free(deleg_slab, dp); num_delegations--; } @@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_close_delegation(struct nfs4_delegation *dp) { - struct file *filp = find_readable_file(dp->dl_file); - dprintk("NFSD: close_delegation dp %p\n",dp); + /* XXX: do we even need this check?: */ if (dp->dl_flock) - vfs_setlease(filp, F_UNLCK, &dp->dl_flock); - nfs4_file_put_access(dp->dl_file, O_RDONLY); + vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); } /* Called under the state lock. */ @@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) free_conn(c); } spin_unlock(&clp->cl_lock); + nfsd4_probe_callback(clp); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) @@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn) return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } -static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) { struct nfsd4_conn *conn; - u32 flags = NFS4_CDFC4_FORE; int ret; - if (ses->se_flags & SESSION4_BACK_CHAN) - flags |= NFS4_CDFC4_BACK; - conn = alloc_conn(rqstp, flags); + conn = alloc_conn(rqstp, dir); if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); @@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) return nfs_ok; } +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; @@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n */ slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + if (numslots < 1) + return NULL; new = alloc_session(slotsize, numslots); if (!new) { @@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); list_add(&new->se_hash, &sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); spin_unlock(&client_lock); - status = nfsd4_new_conn(rqstp, new); + status = nfsd4_new_conn_from_crses(rqstp, new); /* whoops: benny points out, status is ignored! (err, or bogus) */ if (status) { free_session(&new->se_ref); return NULL; } - if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); - - clp->cl_cb_session = new; - clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; - svc_xprt_get(rqstp->rq_xprt); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); - nfsd4_probe_callback(clp); } + nfsd4_probe_callback(clp); return new; } @@ -817,7 +833,9 @@ static void unhash_session(struct nfsd4_session *ses) { list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); } /* must be called under the client_lock */ @@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp) mark_client_expired(clp); list_del(&clp->cl_lru); + spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); } static void @@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); - atomic_set(&clp->cl_cb_set, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); spin_lock_init(&clp->cl_lock); INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); clp->cl_time = get_seconds(); @@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } -/* - * Return 1 iff clp's clientid establishment method matches the use_exchange_id - * parameter. Matching is based on the fact the at least one of the - * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 - * - * FIXME: we need to unify the clientid namespaces for nfsv4.x - * and correctly deal with client upgrade/downgrade in EXCHANGE_ID - * and SET_CLIENTID{,_CONFIRM} - */ -static inline int -match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +static bool clp_used_exchangeid(struct nfs4_client *clp) { - bool has_exchange_flags = (clp->cl_exchange_flags != 0); - return use_exchange_id == has_exchange_flags; -} + return clp->cl_exchange_flags != 0; +} static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_confirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } +static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) +{ + switch (family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_family = AF_INET; + ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; + return; + case AF_INET6: + ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; + return; + } +} + static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ @@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; + rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, case SP4_NONE: break; case SP4_SSV: - return nfserr_encr_alg_unsupp; + return nfserr_serverfault; default: BUG(); /* checked by xdr code */ case SP4_MACH_CRED: @@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfs_ok; - conf = find_confirmed_client_by_str(dname, strhashval, true); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { + if (!clp_used_exchangeid(conf)) { + status = nfserr_clid_inuse; /* XXX: ? */ + goto out; + } if (!same_verf(&verf, &conf->cl_verifier)) { /* 18.35.4 case 8 */ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { @@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, goto out; } - unconf = find_unconfirmed_client_by_str(dname, strhashval, true); + unconf = find_unconfirmed_client_by_str(dname, strhashval); if (unconf) { /* * Possible retry or client restart. Per 18.35.4 case 4, @@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfs_ok; memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); + memcpy(&cr_ses->fore_channel, &new->se_fchannel, + sizeof(struct nfsd4_channel_attrs)); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; @@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock iself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return nfs_ok; +} + static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) @@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r, spin_unlock(&client_lock); nfs4_lock_state(); - /* wait for callbacks */ - nfsd4_shutdown_callback(ses->se_client); + nfsd4_probe_callback_sync(ses->se_client); nfs4_unlock_state(); nfsd4_del_conns(ses); @@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, out: /* Hold a session reference until done processing the compound. */ if (cstate->session) { + struct nfs4_client *clp = session->se_client; + nfsd4_get_session(cstate->session); - atomic_inc(&session->se_client->cl_refcount); + atomic_inc(&clp->cl_refcount); + if (clp->cl_cb_state == NFSD4_CB_DOWN) + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; } kfree(conn); spin_unlock(&client_lock); @@ -1775,7 +1846,6 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval, false); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { char addr_str[INET6_ADDRSTRLEN]; rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, @@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. */ - unconf = find_unconfirmed_client_by_str(dname, strhashval, false); + unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { /* @@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * for consistent minorversion use throughout: */ new->cl_minorversion = 0; - gen_callback(new, setclid, rpc_get_scope_id(sa)); + gen_callback(new, setclid, rqstp); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; @@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { - atomic_set(&conf->cl_cb_set, 0); nfsd4_change_callback(conf, &unconf->cl_cb_conn); nfsd4_probe_callback(conf); expire_client(unconf); @@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash, false); + hash); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl) nfsd4_cb_recall(dp); } -/* - * The file_lock is being reapd. - * - * Called by locks_free_lock() with lock_flocks() held. - */ -static -void nfsd_release_deleg_cb(struct file_lock *fl) -{ - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; - - dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count)); - - if (!(fl->fl_flags & FL_LEASE) || !dp) - return; - dp->dl_flock = NULL; -} - -/* - * Called from setlease() with lock_flocks() held - */ -static -int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) -{ - struct nfs4_delegation *onlistd = - (struct nfs4_delegation *)onlist->fl_owner; - struct nfs4_delegation *tryd = - (struct nfs4_delegation *)try->fl_owner; - - if (onlist->fl_lmops != try->fl_lmops) - return 0; - - return onlistd->dl_client == tryd->dl_client; -} - - static int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) { @@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, - .fl_release_private = nfsd_release_deleg_cb, - .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); - if (status == nfserr_dropit) - status = nfserr_jukebox; if (status) return status; } @@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open) open->op_stateowner->so_client->cl_firststate = 1; } +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + /* * Attempt to hand out a delegation. */ @@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta { struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; - int cb_up = atomic_read(&sop->so_client->cl_cb_set); + int cb_up; struct file_lock *fl; int status, flag = 0; + cb_up = nfsd4_cb_channel_good(sop->so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. - * the lock_manager callbacks fl_mylease and fl_change are used + * the lock_manager callback fl_change is used */ if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); @@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) - && !atomic_read(&clp->cl_cb_set)) + && clp->cl_cb_state != NFSD4_CB_UP) goto out; status = nfs_ok; out: @@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (status) goto out; renew_client(dp->dl_client); - if (filpp) + if (filpp) { *filpp = find_readable_file(dp->dl_file); - BUG_ON(!*filpp); + BUG_ON(!*filpp); + } } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) @@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); + clp = find_confirmed_client_by_str(name, strhashval); return clp ? 1 : 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f35a94a04026..956629b9cdc9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,14 @@ #include <linux/namei.h> #include <linux/statfs.h> #include <linux/utsname.h> -#include <linux/nfsd_idmap.h> -#include <linux/nfs4_acl.h> #include <linux/sunrpc/svcauth_gss.h> +#include "idmap.h" +#include "acl.h" #include "xdr4.h" #include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - host_err = 0; + status = nfs_ok; if (ace->whotype != NFS4_ACL_WHO_NAMED) ace->who = 0; else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - host_err = nfsd_map_name_to_gid(argp->rqstp, + status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &ace->who); else - host_err = nfsd_map_name_to_uid(argp->rqstp, + status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &ace->who); - if (host_err) - goto out_nfserr; + if (status) + return status; } } else *acl = NULL; @@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_TAIL; } +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + u32 dummy; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: Perhaps Tom Tucker could help us figure out how we + * should be using ctsa_use_conn_in_rdma_mode: */ + READ32(dummy); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { @@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { __be32 status; @@ -1005,7 +1032,7 @@ static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, struct nfsd4_exchange_id *exid) { - int dummy; + int dummy, tmp; DECODE_HEAD; READ_BUF(NFS4_VERIFIER_SIZE); @@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* ssp_hash_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); @@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { /* new operations for NFSv4.1 */ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, @@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_resource: nfserr = nfserr_toosmall; goto fail; - case nfserr_dropit: - goto fail; case nfserr_noent: goto skip_entry; default: @@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return nfserr; } +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? */ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { @@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) +nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, + __be32 nfserr,struct svc_export *exp) { int i = 0; - struct svc_export *exp = secinfo->si_exp; u32 nflavs; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; @@ -2892,6 +2939,20 @@ out: return nfserr; } +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); +} + /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. @@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITE32(seq->seqid); WRITE32(seq->slotid); WRITE32(seq->maxslots); - /* - * FIXME: for now: - * target_maxslots = maxslots - * status_flags = 0 - */ + /* For now: target_maxslots = maxslots */ WRITE32(seq->maxslots); - WRITE32(0); + WRITE32(seq->status_flags); ADJUST_ARGS(); resp->cstate.datap = p; /* DRC cache data pointer */ @@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.1 operations */ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, @@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4514ebbee4d6..33b3e2b06779 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -8,12 +8,12 @@ #include <linux/namei.h> #include <linux/ctype.h> -#include <linux/nfsd_idmap.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> +#include "idmap.h" #include "nfsd.h" #include "cache.h" @@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { +#ifdef CONFIG_NFSD_DEPRECATED static int warned; if (file->f_dentry->d_name.name[0] == '.' && !warned) { printk(KERN_INFO @@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size current->comm, file->f_dentry->d_name.name); warned = 1; } +#endif if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 6b641cf2c19a..7ecfa2420307 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) #define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) #define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) #define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) #define nfserr_grace cpu_to_be32(NFSERR_GRACE) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08e17264784b..e15dc45fc5ec 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -735,9 +735,9 @@ nfserrno (int errno) { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_dropit, -EAGAIN }, - { nfserr_dropit, -ENOMEM }, - { nfserr_badname, -ESRCH }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2bae1d86f5f2..18743c4d8bca 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit) { + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 39adc27b0685..3074656ba7bf 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,10 +68,12 @@ typedef struct { struct nfsd4_callback { void *cb_op; struct nfs4_client *cb_clp; + struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; const struct rpc_call_ops *cb_ops; struct work_struct cb_work; + bool cb_done; }; struct nfs4_delegation { @@ -81,6 +83,7 @@ struct nfs4_delegation { atomic_t dl_count; /* ref count */ struct nfs4_client *dl_client; struct nfs4_file *dl_file; + struct file *dl_vfs_file; struct file_lock *dl_flock; u32 dl_type; time_t dl_time; @@ -95,6 +98,7 @@ struct nfs4_delegation { struct nfs4_cb_conn { /* SETCLIENTID info */ struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; size_t cb_addrlen; u32 cb_prog; /* used only in 4.0 case; per-session otherwise */ @@ -146,6 +150,11 @@ struct nfsd4_create_session { u32 gid; }; +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + /* The single slot clientid cache structure */ struct nfsd4_clid_slot { u32 sl_seqid; @@ -235,9 +244,13 @@ struct nfs4_client { unsigned long cl_cb_flags; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; - atomic_t cl_cb_set; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 + int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; + struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void nfs4_free_stateowner(struct kref *kref); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0f79e33a65d7..641117f2188d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others @@ -35,8 +34,8 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 -#include <linux/nfs4_acl.h> -#include <linux/nfsd_idmap.h> +#include "acl.h" +#include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -274,6 +273,13 @@ out: return err; } +static int nfsd_break_lease(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + return break_lease(inode, O_WRONLY | O_NONBLOCK); +} + /* * Commit metadata changes to stable storage. */ @@ -376,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out; } - /* - * If we are changing the size of the file, then - * we need to break all leases. - */ - host_err = break_lease(inode, O_WRONLY | O_NONBLOCK); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; - if (host_err) /* ENOMEM or EWOULDBLOCK */ - goto out_nfserr; - host_err = get_write_access(inode); if (host_err) goto out_nfserr; @@ -426,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, err = nfserr_notsync; if (!check_guard || guardtime == inode->i_ctime.tv_sec) { + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_nfserr; fh_lock(fhp); + host_err = notify_change(dentry, iap); err = nfserrno(host_err); fh_unlock(fhp); @@ -753,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, */ if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; @@ -875,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static inline int svc_msnfs(struct svc_fh *ffhp) -{ -#ifdef MSNFS - return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS); -#else - return 0; -#endif -} - static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -896,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; inode = file->f_path.dentry->d_inode; - if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) - goto out; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 0, @@ -923,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_access(file); } else err = nfserrno(host_err); -out: return err; } @@ -988,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; -#ifdef MSNFS - err = nfserr_perm; - - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) - goto out; -#endif - dentry = file->f_path.dentry; inode = dentry->d_inode; exp = fhp->fh_export; @@ -1046,7 +1023,6 @@ out_nfserr: err = 0; else err = nfserrno(host_err); -out: return err; } @@ -1666,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); goto out_dput; } + err = nfserr_noent; + if (!dold->d_inode) + goto out_drop_write; + host_err = nfsd_break_lease(dold->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1677,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } +out_drop_write: mnt_drop_write(tfhp->fh_export->ex_path.mnt); out_dput: dput(dnew); @@ -1751,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; - if (svc_msnfs(ffhp) && - ((odentry->d_count > 1) || (ndentry->d_count > 1))) { - host_err = -EPERM; - goto out_dput_new; - } - host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; @@ -1764,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (host_err) goto out_dput_new; + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) host_err = commit_metadata(ffhp); } - +out_drop_write: mnt_drop_write(ffhp->fh_export->ex_path.mnt); - out_dput_new: dput(ndentry); out_dput_old: @@ -1835,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_nfserr; - if (type != S_IFDIR) { /* It's UNLINK */ -#ifdef MSNFS - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (rdentry->d_count > 1)) { - host_err = -EPERM; - } else -#endif + host_err = nfsd_break_lease(rdentry->d_inode); + if (host_err) + goto out_put; + if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ + else host_err = vfs_rmdir(dirp, rdentry); - } - +out_put: dput(rdentry); if (!host_err) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 60fce3dc5cb5..366401e1a536 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -311,6 +311,11 @@ struct nfsd4_secinfo { struct svc_export *si_exp; /* response */ }; +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + struct nfsd4_setattr { stateid_t sa_stateid; /* request */ u32 sa_bmval[3]; /* request */ @@ -373,8 +378,8 @@ struct nfsd4_sequence { u32 cachethis; /* request */ #if 0 u32 target_maxslots; /* response */ - u32 status_flags; /* response */ #endif /* not yet */ + u32 status_flags; /* response */ }; struct nfsd4_destroy_session { @@ -422,6 +427,7 @@ struct nfsd4_op { /* NFSv4.1 */ struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; @@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_create_session *); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6cad..faca44997099 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 92f1f2acc6ab..ac1c7e8378dd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -896,7 +896,6 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) @@ -908,11 +907,7 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); - } else if (!(bp->b_flags & XBF_STALE) && + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a76c2428faff..cbe65950e524 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -152,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -183,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -360,8 +356,7 @@ xfs_buf_set_ref( static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 000000000000..05201ae719e5 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 000000000000..e82b6dd3e127 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a165..ef51eb43e137 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -41,6 +41,40 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* * xfs_iozero * * xfs_iozero clears the specified range of buffer supplied, @@ -262,22 +296,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +318,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +342,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,10 +350,61 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occured. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * couuld cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. + */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +415,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,27 +439,9 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -562,245 +628,258 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. + */ STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; - int ioflags = 0; - xfs_fsize_t isize, new_size; - int iolock; - size_t ocount = 0, count; - int need_i_mutex; + xfs_fsize_t new_size; + int error = 0; - XFS_STATS_INC(xs_write_calls); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } - BUG_ON(iocb->ki_pos != pos); + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); + + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - } + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); +} -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; } - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; - if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* + */ + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; + } + current->backing_dev_info = NULL; + return ret; +} - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + XFS_STATS_INC(xs_write_calls); - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); + BUG_ON(iocb->ki_pos != pos); - pos += ret; - count -= ret; + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; + if (ocount == 0) + return 0; -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); - current->backing_dev_info = NULL; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); - error = -ret; if (ret <= 0) - goto out_unlock_internal; - - XFS_STATS_ADD(xs_write_bytes, ret); + goto out_unlock; /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; - int error2; - - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + int error, error2; - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (error) + ret = error; + else if (error2) + ret = error2; } - out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392e..b06ede1d0bed 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -1294,6 +1295,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bd07f7339366..9731898083ae 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1414,7 +1414,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a02480de9759..e22f0057d21f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -362,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -503,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae4..ee3cee097e7e 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 647af2a2e7aa..2d0bcb479075 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 975aa10e1a47..e6cf955ec0fc 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,86 +25,78 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) { - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + int panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. - */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(panic); } void assfail(char *expr, char *file, int line) { - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr, + file, line); BUG(); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4f..05699f67d475 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include <stdarg.h> -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index fa8723f5870a..f3227984a9bf 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. */ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -2615,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a97271..0ab56b32c7eb 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +206,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ed2b65f3f8b9..98c6f73b6752 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,7 +141,6 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -959,128 +958,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then noone will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ +do_callbacks: xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87c..4c7db74a05f7 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b8..10dce5475f02 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f56d30e8040c..cec89dd5d7d2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -612,12 +612,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. */ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -632,8 +633,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1e..1b6a98b66886 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0bf24b11d0c4..ae6fef1ff563 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -377,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 204d8e5fa7fa..aa0ebb776903 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3800,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f80a067a4658..33dbc4e0ad62 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1137,7 +1137,7 @@ out_undo_fdblocks: if (blkdelta) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); out: - ASSERT(error = 0); + ASSERT(error == 0); return; } |