diff options
author | James Bottomley <jejb@mulgrave.il.steeleye.com> | 2006-06-10 22:47:26 +0400 |
---|---|---|
committer | James Bottomley <jejb@mulgrave.il.steeleye.com> | 2006-06-10 22:47:26 +0400 |
commit | f0cd91a68acdc9b49d7f6738b514a426da627649 (patch) | |
tree | 8ad73564015794197583b094217ae0a71e71e753 /fs | |
parent | 60eef25701d25e99c991dd0f4a9f3832a0c3ad3e (diff) | |
parent | 128e6ced247cda88f96fa9f2e4ba8b2c4a681560 (diff) | |
download | linux-f0cd91a68acdc9b49d7f6738b514a426da627649.tar.xz |
Merge ../linux-2.6
Diffstat (limited to 'fs')
82 files changed, 2101 insertions, 915 deletions
diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c index 71742ba150c4..6f2617820a4e 100644 --- a/fs/9p/fcall.c +++ b/fs/9p/fcall.c @@ -98,23 +98,20 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) { - int fid; + int fid, id; struct v9fs_session_info *v9ses; - if (err) - return; - + id = 0; fid = tc->params.tclunk.fid; - kfree(tc); - - if (!rc) - return; - - v9ses = a; - if (rc->id == RCLUNK) - v9fs_put_idpool(fid, &v9ses->fidpool); + if (rc) + id = rc->id; + kfree(tc); kfree(rc); + if (id == RCLUNK) { + v9ses = a; + v9fs_put_idpool(fid, &v9ses->fidpool); + } } /** diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 3e5b124a7212..f4407eb276c7 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -50,15 +50,23 @@ enum { Wpending = 8, /* can write */ }; +enum { + None, + Flushing, + Flushed, +}; + struct v9fs_mux_poll_task; struct v9fs_req { + spinlock_t lock; int tag; struct v9fs_fcall *tcall; struct v9fs_fcall *rcall; int err; v9fs_mux_req_callback cb; void *cba; + int flush; struct list_head req_list; }; @@ -96,8 +104,8 @@ struct v9fs_mux_poll_task { struct v9fs_mux_rpc { struct v9fs_mux_data *m; - struct v9fs_req *req; int err; + struct v9fs_fcall *tcall; struct v9fs_fcall *rcall; wait_queue_head_t wqueue; }; @@ -524,10 +532,9 @@ again: static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { - int ecode, tag; + int ecode; struct v9fs_str *ename; - tag = req->tag; if (!req->err && req->rcall->id == RERROR) { ecode = req->rcall->params.rerror.errno; ename = &req->rcall->params.rerror.error; @@ -553,23 +560,6 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) if (!req->err) req->err = -EIO; } - - if (req->err == ERREQFLUSH) - return; - - if (req->cb) { - dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", - req->tcall, req->rcall); - - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); - req->cb = NULL; - } else - kfree(req->rcall); - - v9fs_mux_put_tag(m, tag); - - wake_up(&m->equeue); - kfree(req); } /** @@ -669,17 +659,26 @@ static void v9fs_read_work(void *a) list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { if (rreq->tag == rcall->tag) { req = rreq; - req->rcall = rcall; - list_del(&req->req_list); - spin_unlock(&m->lock); - process_request(m, req); + if (req->flush != Flushing) + list_del(&req->req_list); break; } - } + spin_unlock(&m->lock); - if (!req) { - spin_unlock(&m->lock); + if (req) { + req->rcall = rcall; + process_request(m, req); + + if (req->flush != Flushing) { + if (req->cb) + (*req->cb) (req, req->cba); + else + kfree(req->rcall); + + wake_up(&m->equeue); + } + } else { if (err >= 0 && rcall->id != RFLUSH) dprintk(DEBUG_ERROR, "unexpected response mux %p id %d tag %d\n", @@ -746,7 +745,6 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, return ERR_PTR(-ENOMEM); v9fs_set_tag(tc, n); - if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) { char buf[150]; @@ -754,12 +752,14 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, printk(KERN_NOTICE "<<< %p %s\n", m, buf); } + spin_lock_init(&req->lock); req->tag = n; req->tcall = tc; req->rcall = NULL; req->err = 0; req->cb = cb; req->cba = cba; + req->flush = None; spin_lock(&m->lock); list_add_tail(&req->req_list, &m->unsent_req_list); @@ -776,72 +776,108 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, return req; } -static void v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, - struct v9fs_fcall *rc, int err) +static void v9fs_mux_free_request(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + v9fs_mux_put_tag(m, req->tag); + kfree(req); +} + +static void v9fs_mux_flush_cb(struct v9fs_req *freq, void *a) { v9fs_mux_req_callback cb; int tag; struct v9fs_mux_data *m; - struct v9fs_req *req, *rptr; + struct v9fs_req *req, *rreq, *rptr; m = a; - dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc, - rc, err, tc->params.tflush.oldtag); + dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, + freq->tcall, freq->rcall, freq->err, + freq->tcall->params.tflush.oldtag); spin_lock(&m->lock); cb = NULL; - tag = tc->params.tflush.oldtag; - list_for_each_entry_safe(req, rptr, &m->req_list, req_list) { - if (req->tag == tag) { + tag = freq->tcall->params.tflush.oldtag; + req = NULL; + list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { + if (rreq->tag == tag) { + req = rreq; list_del(&req->req_list); - if (req->cb) { - cb = req->cb; - req->cb = NULL; - spin_unlock(&m->lock); - (*cb) (req->cba, req->tcall, req->rcall, - req->err); - } - kfree(req); - wake_up(&m->equeue); break; } } + spin_unlock(&m->lock); - if (!cb) - spin_unlock(&m->lock); + if (req) { + spin_lock(&req->lock); + req->flush = Flushed; + spin_unlock(&req->lock); + + if (req->cb) + (*req->cb) (req, req->cba); + else + kfree(req->rcall); + + wake_up(&m->equeue); + } - v9fs_mux_put_tag(m, tag); - kfree(tc); - kfree(rc); + kfree(freq->tcall); + kfree(freq->rcall); + v9fs_mux_free_request(m, freq); } -static void +static int v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) { struct v9fs_fcall *fc; + struct v9fs_req *rreq, *rptr; dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); + /* if a response was received for a request, do nothing */ + spin_lock(&req->lock); + if (req->rcall || req->err) { + spin_unlock(&req->lock); + dprintk(DEBUG_MUX, "mux %p req %p response already received\n", m, req); + return 0; + } + + req->flush = Flushing; + spin_unlock(&req->lock); + + spin_lock(&m->lock); + /* if the request is not sent yet, just remove it from the list */ + list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) { + if (rreq->tag == req->tag) { + dprintk(DEBUG_MUX, "mux %p req %p request is not sent yet\n", m, req); + list_del(&rreq->req_list); + req->flush = Flushed; + spin_unlock(&m->lock); + if (req->cb) + (*req->cb) (req, req->cba); + return 0; + } + } + spin_unlock(&m->lock); + + clear_thread_flag(TIF_SIGPENDING); fc = v9fs_create_tflush(req->tag); v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); + return 1; } static void -v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) +v9fs_mux_rpc_cb(struct v9fs_req *req, void *a) { struct v9fs_mux_rpc *r; - if (err == ERREQFLUSH) { - kfree(rc); - dprintk(DEBUG_MUX, "err req flush\n"); - return; - } - + dprintk(DEBUG_MUX, "req %p r %p\n", req, a); r = a; - dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req, - tc, rc, err); - r->rcall = rc; - r->err = err; + r->rcall = req->rcall; + r->err = req->err; + + if (req->flush!=None && !req->err) + r->err = -ERESTARTSYS; + wake_up(&r->wqueue); } @@ -856,12 +892,13 @@ int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc) { - int err; + int err, sigpending; unsigned long flags; struct v9fs_req *req; struct v9fs_mux_rpc r; r.err = 0; + r.tcall = tc; r.rcall = NULL; r.m = m; init_waitqueue_head(&r.wqueue); @@ -869,48 +906,50 @@ v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, if (rc) *rc = NULL; + sigpending = 0; + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } + req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); if (IS_ERR(req)) { err = PTR_ERR(req); dprintk(DEBUG_MUX, "error %d\n", err); - return PTR_ERR(req); + return err; } - r.req = req; - dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc, - req->tag, &r, req); err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); if (r.err < 0) err = r.err; if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { - spin_lock(&m->lock); - req->tcall = NULL; - req->err = ERREQFLUSH; - spin_unlock(&m->lock); + if (v9fs_mux_flush_request(m, req)) { + /* wait until we get response of the flush message */ + do { + clear_thread_flag(TIF_SIGPENDING); + err = wait_event_interruptible(r.wqueue, + r.rcall || r.err); + } while (!r.rcall && !r.err && err==-ERESTARTSYS && + m->trans->status==Connected && !m->err); + } + sigpending = 1; + } - clear_thread_flag(TIF_SIGPENDING); - v9fs_mux_flush_request(m, req); + if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - if (!err) { - if (r.rcall) - dprintk(DEBUG_MUX, "got response id %d tag %d\n", - r.rcall->id, r.rcall->tag); - - if (rc) - *rc = r.rcall; - else - kfree(r.rcall); - } else { + if (rc) + *rc = r.rcall; + else kfree(r.rcall); - dprintk(DEBUG_MUX, "got error %d\n", err); - if (err > 0) - err = -EIO; - } + + v9fs_mux_free_request(m, req); + if (err > 0) + err = -EIO; return err; } @@ -951,12 +990,15 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) struct v9fs_req *req, *rtmp; LIST_HEAD(cancel_list); - dprintk(DEBUG_MUX, "mux %p err %d\n", m, err); + dprintk(DEBUG_ERROR, "mux %p err %d\n", m, err); m->err = err; spin_lock(&m->lock); list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); } + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { + list_move(&req->req_list, &cancel_list); + } spin_unlock(&m->lock); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { @@ -965,11 +1007,9 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) req->err = err; if (req->cb) - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + (*req->cb) (req, req->cba); else kfree(req->rcall); - - kfree(req); } wake_up(&m->equeue); diff --git a/fs/9p/mux.h b/fs/9p/mux.h index e90bfd32ea42..fb10c50186a1 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -24,6 +24,7 @@ */ struct v9fs_mux_data; +struct v9fs_req; /** * v9fs_mux_req_callback - callback function that is called when the @@ -36,8 +37,7 @@ struct v9fs_mux_data; * @rc - response call * @err - error code (non-zero if error occured) */ -typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, - struct v9fs_fcall *rc, int err); +typedef void (*v9fs_mux_req_callback)(struct v9fs_req *req, void *a); int v9fs_mux_global_init(void); void v9fs_mux_global_exit(void); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 083dcfcd158e..1a8e46084f0e 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -72,11 +72,17 @@ int v9fs_file_open(struct inode *inode, struct file *file) return -ENOSPC; } - err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL); + err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, &fcall); if (err < 0) { dprintk(DEBUG_ERROR, "rewalk didn't work\n"); - goto put_fid; + if (fcall && fcall->id == RWALK) + goto clunk_fid; + else { + v9fs_put_idpool(fid, &v9ses->fidpool); + goto free_fcall; + } } + kfree(fcall); /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ /* translate open mode appropriately */ @@ -109,8 +115,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) clunk_fid: v9fs_t_clunk(v9ses, fid); -put_fid: - v9fs_put_idpool(fid, &v9ses->fidpool); +free_fcall: kfree(fcall); return err; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 133db366d306..2cb87ba4b1c1 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -270,7 +270,10 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm, err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); if (err < 0) { PRINT_FCALL_ERROR("clone error", fcall); - goto put_fid; + if (fcall && fcall->id == RWALK) + goto clunk_fid; + else + goto put_fid; } kfree(fcall); @@ -322,6 +325,9 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) &fcall); if (err < 0) { + if (fcall && fcall->id == RWALK) + goto clunk_fid; + PRINT_FCALL_ERROR("walk error", fcall); v9fs_put_idpool(nfid, &v9ses->fidpool); goto error; @@ -640,19 +646,26 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, } result = v9fs_t_walk(v9ses, dirfidnum, newfid, - (char *)dentry->d_name.name, NULL); + (char *)dentry->d_name.name, &fcall); + if (result < 0) { - v9fs_put_idpool(newfid, &v9ses->fidpool); + if (fcall && fcall->id == RWALK) + v9fs_t_clunk(v9ses, newfid); + else + v9fs_put_idpool(newfid, &v9ses->fidpool); + if (result == -ENOENT) { d_add(dentry, NULL); dprintk(DEBUG_VFS, "Return negative dentry %p count %d\n", dentry, atomic_read(&dentry->d_count)); + kfree(fcall); return NULL; } dprintk(DEBUG_ERROR, "walk error:%d\n", result); goto FreeFcall; } + kfree(fcall); result = v9fs_t_stat(v9ses, newfid, &fcall); if (result < 0) { diff --git a/fs/Kconfig b/fs/Kconfig index 2524629dc835..f9b5842c8d2d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -842,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + <file:Documentation/vm/hugetlbpage.txt> for details. + + If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS diff --git a/fs/Makefile b/fs/Makefile index 83bf478e786b..078d3d1191a5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_DNOTIFY) += dnotify.o obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ obj-$(CONFIG_PROFILING) += dcookies.o @@ -100,5 +101,4 @@ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ -obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d4c2d636c479..a42143ca0169 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -416,10 +416,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; } - retval = -EIO; bh = affs_bread(sb, old_dentry->d_inode->i_ino); if (!bh) - goto done; + return -EIO; /* Remove header from its parent directory. */ affs_lock_dir(old_dir); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 57c4903614e5..d6603d02304c 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_wait_queue { struct autofs_wait_queue *next; autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ - int hash; - int len; + unsigned int hash; + unsigned int len; char *name; u32 dev; u64 ino; @@ -85,7 +85,6 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t notify; atomic_t wait_ctr; }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 84e030c8ddd0..5100f984783f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -327,6 +327,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); int oz_mode = autofs4_oz_mode(sbi); unsigned int lookup_type; int status; @@ -340,13 +341,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (oz_mode || !lookup_type) goto done; - /* - * If a request is pending wait for it. - * If it's a mount then it won't be expired till at least - * a liitle later and if it's an expire then we might need - * to mount it again. - */ - if (autofs4_ispending(dentry)) { + /* If an expire request is pending wait for it. */ + if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { DPRINTK("waiting for active request %p name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 142ab6aa2aa1..ce103e7b0bc3 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -189,14 +189,30 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, return len; } +static struct autofs_wait_queue * +autofs4_find_wait(struct autofs_sb_info *sbi, + char *name, unsigned int hash, unsigned int len) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->hash == hash && + wq->len == len && + wq->name && !memcmp(wq->name, name, len)) + break; + } + return wq; +} + int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, enum autofs_notify notify) { + struct autofs_info *ino; struct autofs_wait_queue *wq; char *name; unsigned int len = 0; unsigned int hash = 0; - int status; + int status, type; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) @@ -223,21 +239,41 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -EINTR; } - for (wq = sbi->queues ; wq ; wq = wq->next) { - if (wq->hash == dentry->d_name.hash && - wq->len == len && - wq->name && !memcmp(wq->name, name, len)) - break; - } + wq = autofs4_find_wait(sbi, name, hash, len); + ino = autofs4_dentry_ino(dentry); + if (!wq && ino && notify == NFY_NONE) { + /* + * Either we've betean the pending expire to post it's + * wait or it finished while we waited on the mutex. + * So we need to wait till either, the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(name); + return -EINTR; + } + wq = autofs4_find_wait(sbi, name, hash, len); + if (wq) + break; + } - if (!wq) { - /* Can't wait for an expire if there's no mount */ - if (notify == NFY_NONE && !d_mountpoint(dentry)) { + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depend on the + * return status of the wait. + */ + if (!wq) { kfree(name); mutex_unlock(&sbi->wq_mutex); - return -ENOENT; + return 0; } + } + if (!wq) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if (!wq) { @@ -263,20 +299,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ atomic_set(&wq->wait_ctr, 2); - atomic_set(&wq->notify, 1); - mutex_unlock(&sbi->wq_mutex); - } else { - atomic_inc(&wq->wait_ctr); mutex_unlock(&sbi->wq_mutex); - kfree(name); - DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); - } - - if (notify != NFY_NONE && atomic_read(&wq->notify)) { - int type; - - atomic_dec(&wq->notify); if (sbi->version < 5) { if (notify == NFY_MOUNT) @@ -299,6 +322,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); + } else { + atomic_inc(&wq->wait_ctr); + mutex_unlock(&sbi->wq_mutex); + kfree(name); + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); } /* wq->name is NULL if and only if the lock is already released */ diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 69f44dcdb0b4..b1c902e319c1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -428,7 +428,6 @@ static int load_flat_file(struct linux_binprm * bprm, loff_t fpos; unsigned long start_code, end_code; int ret; - int exec_fileno; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ inode = bprm->file->f_dentry->d_inode; @@ -502,21 +501,12 @@ static int load_flat_file(struct linux_binprm * bprm, goto err; } - /* check file descriptor */ - exec_fileno = get_unused_fd(); - if (exec_fileno < 0) { - ret = -EMFILE; - goto err; - } - get_file(bprm->file); - fd_install(exec_fileno, bprm->file); - /* Flush all traces of the currently running executable */ if (id == 0) { result = flush_old_exec(bprm); if (result) { ret = result; - goto err_close; + goto err; } /* OK, This is the point of no return */ @@ -548,7 +538,7 @@ static int load_flat_file(struct linux_binprm * bprm, textpos = (unsigned long) -ENOMEM; printk("Unable to mmap process text, errno %d\n", (int)-textpos); ret = textpos; - goto err_close; + goto err; } down_write(¤t->mm->mmap_sem); @@ -564,7 +554,7 @@ static int load_flat_file(struct linux_binprm * bprm, (int)-datapos); do_munmap(current->mm, textpos, text_len); ret = realdatastart; - goto err_close; + goto err; } datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); @@ -587,7 +577,7 @@ static int load_flat_file(struct linux_binprm * bprm, do_munmap(current->mm, textpos, text_len); do_munmap(current->mm, realdatastart, data_len + extra); ret = result; - goto err_close; + goto err; } reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); @@ -606,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm, printk("Unable to allocate RAM for process text/data, errno %d\n", (int)-textpos); ret = textpos; - goto err_close; + goto err; } realdatastart = textpos + ntohl(hdr->data_start); @@ -652,7 +642,7 @@ static int load_flat_file(struct linux_binprm * bprm, do_munmap(current->mm, textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); ret = result; - goto err_close; + goto err; } } @@ -717,7 +707,7 @@ static int load_flat_file(struct linux_binprm * bprm, addr = calc_reloc(*rp, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } *rp = addr; } @@ -747,7 +737,7 @@ static int load_flat_file(struct linux_binprm * bprm, rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); if (rp == (unsigned long *)RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } /* Get the pointer's value. */ @@ -762,7 +752,7 @@ static int load_flat_file(struct linux_binprm * bprm, addr = calc_reloc(addr, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } /* Write back the relocated pointer. */ @@ -783,8 +773,6 @@ static int load_flat_file(struct linux_binprm * bprm, stack_len); return 0; -err_close: - sys_close(exec_fileno); err: return ret; } @@ -1116,6 +1116,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_io_vec = &bp->bv1; bp->bio2.bi_io_vec = &bp->bv2; + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + bp->bio1.bi_end_io = bio_pair_end_1; bp->bio2.bi_end_io = bio_pair_end_2; diff --git a/fs/block_dev.c b/fs/block_dev.c index af88c43043d5..f5958f413bd1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = { .readv = generic_file_readv, .writev = generic_file_write_nolock, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 8a2de038882e..7271bb0257f6 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,7 +1,18 @@ +Version 1.43 +------------ +POSIX locking to servers which support CIFS POSIX Extensions +(disabled by default controlled by proc/fs/cifs/Experimental). +Handle conversion of long share names (especially Asian languages) +to Unicode during mount. + Version 1.42 ------------ Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. Version 1.41 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index b2b4d0803761..0355003f4f0a 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -511,6 +511,14 @@ LinuxExtensionsEnabled If set to one then the client will attempt to support and want to map the uid and gid fields to values supplied at mount (rather than the actual values, then set this to zero. (default 1) +Experimental When set to 1 used to enable certain experimental + features (currently enables multipage writes + when signing is enabled, the multipage write + performance enhancement was disabled when + signing turned on in case buffer was modified + just before it was sent, also this flag will + be used to use the new experimental sessionsetup + code). These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d4b713e5affb..c262d8874ce9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -33,6 +33,7 @@ #include <linux/vfs.h> #include <linux/mempool.h> #include <linux/delay.h> +#include <linux/kthread.h> #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256"); -static DECLARE_COMPLETION(cifs_oplock_exited); -static DECLARE_COMPLETION(cifs_dnotify_exited); - extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg) __u16 netfid; int rc; - daemonize("cifsoplockd"); - allow_signal(SIGTERM); - - oplockThread = current; do { if (try_to_freeze()) continue; @@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* yield in case q were corrupt */ } - } while(!signal_pending(current)); - oplockThread = NULL; - complete_and_exit (&cifs_oplock_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int cifs_dnotify_thread(void * dummyarg) @@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg) struct list_head *tmp; struct cifsSesInfo *ses; - daemonize("cifsdnotifyd"); - allow_signal(SIGTERM); - - dnotifyThread = current; do { if(try_to_freeze()) continue; @@ -931,8 +921,9 @@ static int cifs_dnotify_thread(void * dummyarg) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); - } while(!signal_pending(current)); - complete_and_exit (&cifs_dnotify_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int __init @@ -982,32 +973,48 @@ init_cifs(void) } rc = cifs_init_inodecache(); - if (!rc) { - rc = cifs_init_mids(); - if (!rc) { - rc = cifs_init_request_bufs(); - if (!rc) { - rc = register_filesystem(&cifs_fs_type); - if (!rc) { - rc = (int)kernel_thread(cifs_oplock_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) { - rc = (int)kernel_thread(cifs_dnotify_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) - return 0; - else - cERROR(1,("error %d create dnotify thread", rc)); - } else { - cERROR(1,("error %d create oplock thread",rc)); - } - } - cifs_destroy_request_bufs(); - } - cifs_destroy_mids(); - } - cifs_destroy_inodecache(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_destroy_request_bufs; + + oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); + if (IS_ERR(oplockThread)) { + rc = PTR_ERR(oplockThread); + cERROR(1,("error %d create oplock thread", rc)); + goto out_unregister_filesystem; } + + dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); + if (IS_ERR(dnotifyThread)) { + rc = PTR_ERR(dnotifyThread); + cERROR(1,("error %d create dnotify thread", rc)); + goto out_stop_oplock_thread; + } + + return 0; + + out_stop_oplock_thread: + kthread_stop(oplockThread); + out_unregister_filesystem: + unregister_filesystem(&cifs_fs_type); + out_destroy_request_bufs: + cifs_destroy_request_bufs(); + out_destroy_mids: + cifs_destroy_mids(); + out_destroy_inodecache: + cifs_destroy_inodecache(); + out_clean_proc: #ifdef CONFIG_PROC_FS cifs_proc_clean(); #endif @@ -1025,14 +1032,8 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); - if(oplockThread) { - send_sig(SIGTERM, oplockThread, 1); - wait_for_completion(&cifs_oplock_exited); - } - if(dnotifyThread) { - send_sig(SIGTERM, dnotifyThread, 1); - wait_for_completion(&cifs_dnotify_exited); - } + kthread_stop(oplockThread); + kthread_stop(dnotifyThread); } MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 4e829dc672a6..c98755dca868 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -99,5 +99,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.42" +#define CIFS_VERSION "1.43" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 2879ba343ca7..310ea2f0e0bf 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -267,7 +267,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const int waitFlag); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, - const __u64 len, const __u64 offset, + const __u64 len, struct file_lock *, const __u16 lock_type, const int waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d705500aa283..925881e00ff2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1355,7 +1355,8 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, - const __u64 lkoffset, const __u16 lock_type, const int waitFlag) + struct file_lock *pLockData, const __u16 lock_type, + const int waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -1366,6 +1367,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, __u16 params, param_offset, offset, byte_count, count; cFYI(1, ("Posix Lock")); + + if(pLockData == NULL) + return EINVAL; + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -1404,10 +1409,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, parm_data->lock_type = cpu_to_le16(lock_type); if(waitFlag) - parm_data->lock_flags = 1; + parm_data->lock_flags = cpu_to_le16(1); parm_data->pid = cpu_to_le32(current->tgid); - parm_data->start = lkoffset; - parm_data->length = len; /* normalize negative numbers */ + parm_data->start = cpu_to_le64(pLockData->fl_start); + parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ pSMB->DataOffset = cpu_to_le16(offset); pSMB->Fid = smb_file_id; @@ -1419,8 +1424,33 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cFYI(1, ("Send error in Posix Lock = %d", rc)); - } + } else if (get_flag) { + /* lock structure can be returned on get */ + __u16 data_offset; + __u16 data_count; + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) { + rc = -EIO; /* bad smb */ + goto plk_err_exit; + } + if(pLockData == NULL) { + rc = -EINVAL; + goto plk_err_exit; + } + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + data_count = le16_to_cpu(pSMBr->t2.DataCount); + if(data_count < sizeof(struct cifs_posix_lock)) { + rc = -EIO; + goto plk_err_exit; + } + parm_data = (struct cifs_posix_lock *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if(parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) + pLockData->fl_type = F_UNLCK; + } + +plk_err_exit: if (pSMB) cifs_small_buf_release(pSMB); @@ -3119,7 +3149,7 @@ findFirstRetry: psrch_inf->endOfSearch = FALSE; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry = + psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; *pnetfid = parms->SearchHandle; } else { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0b86d5ca9014..bae1479318d1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2148,6 +2148,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); if(ses->serverOS == NULL) goto sesssetup_nomem; @@ -2160,6 +2162,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words-1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2177,6 +2181,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); if(ses->serverDomain == NULL) @@ -2187,15 +2193,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, ses->serverDomain[2*len] = 0; ses->serverDomain[1+(2*len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ /* if these kcallocs fail not much we can do, but better to not fail the sesssetup itself */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2204,6 +2217,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverOS == NULL) goto sesssetup_nomem; @@ -2214,6 +2229,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2223,6 +2240,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1,GFP_KERNEL); if(ses->serverDomain == NULL) goto sesssetup_nomem; @@ -2427,6 +2446,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -2441,6 +2462,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -2454,7 +2477,9 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, remaining_words -= len + 1; if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string is not always null terminated (for e.g. for Windows XP & 2000) */ + /* last string not null terminated (e.g.Windows XP/2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); cifs_strfromUCS_le(ses->serverDomain, (__le16 *)bcc_ptr, @@ -2463,11 +2488,18 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, ses->serverDomain[2*len] = 0; ses->serverDomain[1+(2*len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2,GFP_KERNEL); - } else { /* no room so create dummy domain and NOS string */ + } + } else {/* no room use dummy domain&NOS */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -2476,6 +2508,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); strncpy(ses->serverOS, bcc_ptr, len); @@ -2484,6 +2518,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -2491,6 +2527,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; @@ -2728,6 +2766,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -2743,6 +2783,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -2760,6 +2802,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -2777,13 +2821,20 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, [1 + (2 * len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ + if(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2792,6 +2843,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); @@ -2803,6 +2856,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); @@ -2812,6 +2867,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); @@ -3116,6 +3173,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -3131,6 +3190,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -3147,6 +3208,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string not always null terminated (e.g. for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -3172,10 +3235,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2,GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -3183,6 +3253,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1,GFP_KERNEL); strncpy(ses->serverOS,bcc_ptr, len); @@ -3191,6 +3263,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len+1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -3198,6 +3272,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len+1,GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; @@ -3282,7 +3358,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; /* align */ } - if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if(ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_STATUS32) { @@ -3294,8 +3371,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, if (ses->capabilities & CAP_UNICODE) { smb_buffer->Flags2 |= SMBFLG2_UNICODE; length = - cifs_strtoUCS((__le16 *) bcc_ptr, tree, 100, nls_codepage); - bcc_ptr += 2 * length; /* convert num of 16 bit words to bytes */ + cifs_strtoUCS((__le16 *) bcc_ptr, tree, + 6 /* max utf8 char length in bytes */ * + (/* server len*/ + 256 /* share len */), nls_codepage); + bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ bcc_ptr += 2; /* skip trailing null */ } else { /* ASCII */ strcpy(bcc_ptr, tree); @@ -3447,6 +3526,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); +#ifdef CONFIG_CIFS_EXPERIMENTAL + if(experimEnabled > 1) + rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, + &ntlmv2_flag, nls_info); + else +#endif if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1d0ca3eaaca5..82315edc77d7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) rc = -ENOMEM; else if (pTcon->ses->capabilities & CAP_UNIX) { @@ -440,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name cifs_sb = CIFS_SB(parent_dir_inode->i_sb); pTcon = cifs_sb->tcon; + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, ("Invalid file name")); + FreeXid(xid); + return ERR_PTR(-EINVAL); + } + } + /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index ec4dfe9bf5ef..633a93811328 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg) cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5c497c529772..e2b4ce1dad66 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -84,6 +84,8 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OVERWRITE_IF; else if ((flags & O_CREAT) == O_CREAT) return FILE_OPEN_IF; + else if ((flags & O_TRUNC) == O_TRUNC) + return FILE_OVERWRITE; else return FILE_OPEN; } @@ -203,9 +205,7 @@ int cifs_open(struct inode *inode, struct file *file) } } - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -658,7 +658,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) else posix_lock_type = CIFS_WRLCK; rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */, - length, pfLock->fl_start, + length, pfLock, posix_lock_type, wait_flag); FreeXid(xid); return rc; @@ -706,7 +706,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return -EOPNOTSUPP; } rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, - length, pfLock->fl_start, + length, pfLock, posix_lock_type, wait_flag); } else rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, @@ -906,9 +906,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (rc != 0) break; } - /* BB FIXME We can not sign across two buffers yet */ - if((pTcon->ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) { + if(experimEnabled || (pTcon->ses->server && + ((pTcon->ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + == 0))) { struct kvec iov[2]; unsigned int len; @@ -923,13 +924,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data, *poffset, &bytes_written, iov, 1, long_op); } else - /* BB FIXME fixup indentation of line below */ - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - write_data + total_written, NULL, long_op); + rc = CIFSSMBWrite(xid, pTcon, + open_file->netfid, + min_t(const int, cifs_sb->wsize, + write_size - total_written), + *poffset, &bytes_written, + write_data + total_written, + NULL, long_op); } if (rc || (bytes_written == 0)) { if (total_written) @@ -968,6 +969,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *open_file; int rc; + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if(cifs_inode == NULL) { + cERROR(1,("Null inode passed to cifs_writeable_file")); + dump_stack(); + return NULL; + } + read_lock(&GlobalSMBSeslock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (open_file->closePend) @@ -1093,12 +1104,11 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_CACHE_SIZE) return generic_writepages(mapping, wbc); - /* BB FIXME we do not have code to sign across multiple buffers yet, - so go to older writepage style write which we can sign if needed */ if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - return generic_writepages(mapping, wbc); + if(!experimEnabled) + return generic_writepages(mapping, wbc); /* * BB: Is this meaningful for a non-block-device file system? diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 957ddd1571c6..4093764ef461 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = 0; } - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9562f5bba65c..2ec99f833142 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, /* No need to check for cross device links since server will do that BB note DFS case in future though (when we may have to check) */ - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); fromName = build_path_from_dentry(old_file); toName = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if((fromName == NULL) || (toName == NULL)) { rc = -ENOMEM; goto cifs_hl_exit; @@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) xid = GetXid(); - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (!full_path) goto out_no_free; @@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c index 78866f925747..115359cc7a32 100644 --- a/fs/cifs/ntlmssp.c +++ b/fs/cifs/ntlmssp.c @@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, } + /* copy session key */ + + /* if Unicode, align strings to two byte boundary */ + + /* copy user name */ /* BB Do we need to special case null user name? */ + + /* copy domain name */ + + /* copy Linux version */ + + /* copy network operating system name */ + + /* update bcc and smb buffer length */ + /* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ /* SMB request buf freed in SendReceive2 */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2f6e2825571e..b689c5035124 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file) if(pTcon == NULL) return -EINVAL; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { return -ENOMEM; @@ -592,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. for the root of a drive and for those we need + to start two entries earlier */ + /* dump_cifs_file_struct(file, "In fce ");*/ if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || @@ -634,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + smbCalcSize((struct smb_hdr *) cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); - current_entry = cifsFile->srch_inf.srch_entries_start; for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) { /* go entry by entry figuring out which is first */ - /* if( . or ..) - skip */ - rc = cifs_entry_is_dot(current_entry,cifsFile); - if(rc == 1) /* is . or .. so skip */ { - cFYI(1,("Entry is .")); /* BB removeme BB */ - /* continue; */ - } else if (rc == 2 ) { - cFYI(1,("Entry is ..")); /* BB removeme BB */ - /* continue; */ - } current_entry = nxt_dir_entry(current_entry,end_of_smb); } if((current_entry == NULL) && (i < pos_in_buf)) { @@ -770,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; + rc = cifs_entry_is_dot(pfindEntry,pCifsF); + /* skip . and .. since we added them first */ + if(rc != 0) + return 0; + cifs_sb = CIFS_SB(file->f_dentry->d_sb); qstring.name = scratch_buf; @@ -898,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: - /*if (filldir(direntry, ".", 1, file->f_pos, + if (filldir(direntry, ".", 1, file->f_pos, file->f_dentry->d_inode->i_ino, DT_DIR) < 0) { - cERROR(1, ("Filldir for current dir failed ")); + cERROR(1, ("Filldir for current dir failed")); rc = -ENOMEM; break; } - file->f_pos++; */ + file->f_pos++; case 1: - /* if (filldir(direntry, "..", 2, file->f_pos, + if (filldir(direntry, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for parent dir failed ")); rc = -ENOMEM; break; } - file->f_pos++; */ - case 2: + file->f_pos++; + default: /* 1) If search is active, is in current search buffer? if it before then restart search @@ -927,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) return rc; } } - default: if(file->private_data == NULL) { rc = -EINVAL; FreeXid(xid); @@ -947,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; */ - /* BB account for . and .. in f_pos as special case */ - rc = find_cifs_entry(xid,pTcon, file, ¤t_entry,&num_to_fill); if(rc) { @@ -977,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) num_to_fill, i)); break; } - + /* if buggy server returns . and .. late do + we want to check for that here? */ rc = cifs_filldir(current_entry, file, filldir, direntry,tmp_buf); file->f_pos++; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3938444d87b2..7754d641775e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -341,9 +335,7 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/compat.c b/fs/compat.c index 7f8e26ea427c..b1f64786a613 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; + ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE); + if (ret) + goto out; + fnv = NULL; if (type == READ) { fn = file->f_op->read; @@ -1313,6 +1317,26 @@ out: return ret; } +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. @@ -1889,7 +1913,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } if (sigmask) { - if (sigsetsize |= sizeof(compat_sigset_t)) + if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (copy_from_user(&ss32, sigmask, sizeof(ss32))) return -EFAULT; @@ -2006,109 +2030,115 @@ union compat_nfsctl_res { struct knfsd_fh cr32_getfs; }; -static int compat_nfs_svc_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_svc_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg->ca32_svc.svc32_nthreads); - return (err) ? -EFAULT : 0; + if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || + get_user(karg->ca_version, &arg->ca32_version) || + __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || + __get_user(karg->ca_svc.svc_nthreads, + &arg->ca32_svc.svc32_nthreads)) + return -EFAULT; + return 0; } -static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_client, sizeof(arg->ca32_client)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_client.cl_ident[0], - &arg->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg->ca32_client.cl32_naddr); - err |= __copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg->ca32_client.cl32_fhkeylen); - err |= __copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); + if (!access_ok(VERIFY_READ, &arg->ca32_client, + sizeof(arg->ca32_client)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_client.cl_ident[0], + &arg->ca32_client.cl32_ident[0], + NFSCLNT_IDMAX) || + __get_user(karg->ca_client.cl_naddr, + &arg->ca32_client.cl32_naddr) || + __copy_from_user(&karg->ca_client.cl_addrlist[0], + &arg->ca32_client.cl32_addrlist[0], + (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || + __get_user(karg->ca_client.cl_fhkeytype, + &arg->ca32_client.cl32_fhkeytype) || + __get_user(karg->ca_client.cl_fhkeylen, + &arg->ca32_client.cl32_fhkeylen) || + __copy_from_user(&karg->ca_client.cl_fhkey[0], + &arg->ca32_client.cl32_fhkey[0], + NFSCLNT_KEYMAX)) + return -EFAULT; - return (err) ? -EFAULT : 0; + return 0; } -static int compat_nfs_exp_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_exp_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_export, sizeof(arg->ca32_export)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_export.ex_client[0], - &arg->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= __copy_from_user(&karg->ca_export.ex_path[0], - &arg->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg->ca32_export.ex32_anon_gid); + if (!access_ok(VERIFY_READ, &arg->ca32_export, + sizeof(arg->ca32_export)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_export.ex_client[0], + &arg->ca32_export.ex32_client[0], + NFSCLNT_IDMAX) || + __copy_from_user(&karg->ca_export.ex_path[0], + &arg->ca32_export.ex32_path[0], + NFS_MAXPATHLEN) || + __get_user(karg->ca_export.ex_dev, + &arg->ca32_export.ex32_dev) || + __get_user(karg->ca_export.ex_ino, + &arg->ca32_export.ex32_ino) || + __get_user(karg->ca_export.ex_flags, + &arg->ca32_export.ex32_flags) || + __get_user(karg->ca_export.ex_anon_uid, + &arg->ca32_export.ex32_anon_uid) || + __get_user(karg->ca_export.ex_anon_gid, + &arg->ca32_export.ex32_anon_gid)) + return -EFAULT; SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); - return (err) ? -EFAULT : 0; + return 0; } -static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_getfd, sizeof(arg->ca32_getfd)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_getfd.gd_addr, - &arg->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))); - err |= __copy_from_user(&karg->ca_getfd.gd_path, - &arg->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfd.gd_version, - &arg->ca32_getfd.gd32_version); + if (!access_ok(VERIFY_READ, &arg->ca32_getfd, + sizeof(arg->ca32_getfd)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfd.gd_addr, + &arg->ca32_getfd.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfd.gd_path, + &arg->ca32_getfd.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfd.gd_version, + &arg->ca32_getfd.gd32_version)) + return -EFAULT; - return (err) ? -EFAULT : 0; + return 0; } -static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_getfs, sizeof(arg->ca32_getfs)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_getfs.gd_addr, - &arg->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= __copy_from_user(&karg->ca_getfs.gd_path, - &arg->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfs.gd_maxlen, - &arg->ca32_getfs.gd32_maxlen); + if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfs.gd_addr, + &arg->ca32_getfs.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfs.gd_path, + &arg->ca32_getfs.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfs.gd_maxlen, + &arg->ca32_getfs.gd32_maxlen)) + return -EFAULT; - return (err) ? -EFAULT : 0; + return 0; } /* This really doesn't need translations, we are only passing * back a union which contains opaque nfs file handle data. */ -static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsctl_res __user *res) +static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, + union compat_nfsctl_res __user *res) { int err; @@ -2117,8 +2147,9 @@ static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsct return (err) ? -EFAULT : 0; } -asmlinkage long compat_sys_nfsservctl(int cmd, struct compat_nfsctl_arg __user *arg, - union compat_nfsctl_res __user *res) +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res) { struct nfsctl_arg *karg; union nfsctl_res *kres; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5638c8f9362f..5f952187fc53 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -505,13 +505,15 @@ static int populate_groups(struct config_group *group) int i; if (group->default_groups) { - /* FYI, we're faking mkdir here + /* + * FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's * i_mutex, so afaik lookup cannot continue through our * parent to find us, let alone mess with our tree. * That said, taking our i_mutex is closer to mkdir - * emulation, and shouldn't hurt. */ + * emulation, and shouldn't hurt. + */ mutex_lock(&dentry->d_inode->i_mutex); for (i = 0; group->default_groups[i]; i++) { @@ -546,20 +548,34 @@ static void unlink_obj(struct config_item *item) item->ci_group = NULL; item->ci_parent = NULL; + + /* Drop the reference for ci_entry */ config_item_put(item); + /* Drop the reference for ci_parent */ config_group_put(group); } } static void link_obj(struct config_item *parent_item, struct config_item *item) { - /* Parent seems redundant with group, but it makes certain - * traversals much nicer. */ + /* + * Parent seems redundant with group, but it makes certain + * traversals much nicer. + */ item->ci_parent = parent_item; + + /* + * We hold a reference on the parent for the child's ci_parent + * link. + */ item->ci_group = config_group_get(to_config_group(parent_item)); list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + /* + * We hold a reference on the child for ci_entry on the parent's + * cg_children + */ config_item_get(item); } @@ -684,6 +700,10 @@ static void client_drop_item(struct config_item *parent_item, type = parent_item->ci_type; BUG_ON(!type); + /* + * If ->drop_item() exists, it is responsible for the + * config_item_put(). + */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), item); @@ -694,23 +714,28 @@ static void client_drop_item(struct config_item *parent_item, static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int ret; + int ret, module_got = 0; struct config_group *group; struct config_item *item; struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; struct config_item_type *type; - struct module *owner; + struct module *owner = NULL; char *name; - if (dentry->d_parent == configfs_sb->s_root) - return -EPERM; + if (dentry->d_parent == configfs_sb->s_root) { + ret = -EPERM; + goto out; + } sd = dentry->d_parent->d_fsdata; - if (!(sd->s_type & CONFIGFS_USET_DIR)) - return -EPERM; + if (!(sd->s_type & CONFIGFS_USET_DIR)) { + ret = -EPERM; + goto out; + } + /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; subsys = to_config_group(parent_item)->cg_subsys; @@ -719,15 +744,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!type || !type->ct_group_ops || (!type->ct_group_ops->make_group && !type->ct_group_ops->make_item)) { - config_item_put(parent_item); - return -EPERM; /* What lack-of-mkdir returns */ + ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ + goto out_put; } name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); if (!name) { - config_item_put(parent_item); - return -ENOMEM; + ret = -ENOMEM; + goto out_put; } + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); down(&subsys->su_sem); @@ -748,40 +774,67 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) kfree(name); if (!item) { - config_item_put(parent_item); - return -ENOMEM; + /* + * If item == NULL, then link_obj() was never called. + * There are no extra references to clean up. + */ + ret = -ENOMEM; + goto out_put; } - ret = -EINVAL; + /* + * link_obj() has been called (via link_group() for groups). + * From here on out, errors must clean that up. + */ + type = item->ci_type; - if (type) { - owner = type->ct_owner; - if (try_module_get(owner)) { - if (group) { - ret = configfs_attach_group(parent_item, - item, - dentry); - } else { - ret = configfs_attach_item(parent_item, - item, - dentry); - } + if (!type) { + ret = -EINVAL; + goto out_unlink; + } - if (ret) { - down(&subsys->su_sem); - if (group) - unlink_group(group); - else - unlink_obj(item); - client_drop_item(parent_item, item); - up(&subsys->su_sem); + owner = type->ct_owner; + if (!try_module_get(owner)) { + ret = -EINVAL; + goto out_unlink; + } - config_item_put(parent_item); - module_put(owner); - } - } + /* + * I hate doing it this way, but if there is + * an error, module_put() probably should + * happen after any cleanup. + */ + module_got = 1; + + if (group) + ret = configfs_attach_group(parent_item, item, dentry); + else + ret = configfs_attach_item(parent_item, item, dentry); + +out_unlink: + if (ret) { + /* Tear down everything we built up */ + down(&subsys->su_sem); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + if (module_got) + module_put(owner); } +out_put: + /* + * link_obj()/link_group() took a reference from child->parent, + * so the parent is safely pinned. We can drop our working + * reference. + */ + config_item_put(parent_item); + +out: return ret; } @@ -801,6 +854,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) if (sd->s_type & CONFIGFS_USET_DEFAULT) return -EPERM; + /* Get a working ref until we have the child */ parent_item = configfs_get_config_item(dentry->d_parent); subsys = to_config_group(parent_item)->cg_subsys; BUG_ON(!subsys); @@ -817,6 +871,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return ret; } + /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); /* Drop reference from above, item already holds one. */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 85d166cdcae4..b55b4ea9a676 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -67,12 +67,13 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d static int debugfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = debugfs_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -EPERM; if (dentry->d_inode) return -EEXIST; + inode = debugfs_get_inode(dir->i_sb, mode, dev); if (inode) { d_instantiate(dentry, inode); dget(dentry); diff --git a/fs/exec.c b/fs/exec.c index 3234a0c32d54..3a79d97ac234 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -704,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. * Note: The old leader also uses thispid until release_task @@ -730,10 +712,8 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(¤t->tasks, &init_task.tasks); + list_add_tail_rcu(¤t->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; leader->group_leader = current; @@ -742,13 +722,6 @@ static int de_thread(struct task_struct *tsk) detach_pid(leader, PIDTYPE_SID); list_del_init(&leader->tasks); - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } - current->exit_signal = SIGCHLD; BUG_ON(leader->exit_state != EXIT_ZOMBIE); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b06b54f1bbbb..4c39009350f3 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -102,7 +102,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, if (acceptable(context, result)) return result; if (S_ISDIR(result->d_inode->i_mode)) { - /* there is no other dentry, so fail */ + err = -EACCES; goto err_result; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 48ae0339af17..2edd7eec88fd 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -711,7 +711,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, * direct blocks blocks */ if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key + 1); + current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) *(where->p + i ) = cpu_to_le32(current_block++); } @@ -724,7 +724,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, if (block_i) { block_i->last_alloc_logical_block = block + blks - 1; block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key + blks - 1); + le32_to_cpu(where[num].key) + blks - 1; } /* We are done with atomic stuff, now do the rest of housekeeping */ @@ -814,11 +814,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { - first_block = chain[depth - 1].key; + first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { + unsigned long blk; + if (!verify_chain(chain, partial)) { /* * Indirect block might be removed by @@ -831,8 +833,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, count = 0; break; } - if (le32_to_cpu(*(chain[depth-1].p+count) == - (first_block + count))) + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) count++; else break; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index aaf1da17b6d4..8c22aa9a7fbb 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; + mutex_lock(&inode->i_mutex); oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } /* @@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + mutex_unlock(&inode->i_mutex); return PTR_ERR(handle); + } if (IS_SYNC(inode)) handle->h_sync = 1; err = ext3_reserve_inode_write(handle, inode, &iloc); @@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) + if (err) { + mutex_unlock(&inode->i_mutex); return err; + } if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); + mutex_unlock(&inode->i_mutex); return err; } case EXT3_IOC_GETVERSION: diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 14f5f6ea3e72..34b39e9a1e5a 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; } lock_buffer(bh); - memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); set_buffer_uptodate(gdb); unlock_buffer(bh); ext3_journal_dirty_metadata(handle, gdb); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6c740f860665..104a62dadb94 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,40 +92,52 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; sigset_t oldset; + int intr; int err; + atomic_inc(&fc->num_waiting); block_sigs(&oldset); - err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (err) - return ERR_PTR(-EINTR); + err = -EINTR; + if (intr) + goto out; req = fuse_request_alloc(); + err = -ENOMEM; if (!req) - return ERR_PTR(-ENOMEM); + goto out; - atomic_inc(&fc->num_waiting); - fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; + req->waiting = 1; return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - atomic_dec(&fc->num_waiting); + if (req->waiting) + atomic_dec(&fc->num_waiting); fuse_request_free(req); } } +/* + * Called with sbput_sem held for read (request_end) or write + * (fuse_put_super). By the time fuse_put_super() is finished, all + * inodes belonging to background requests must be released, so the + * iputs have to be done within the locked region. + */ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); - if (req->file) - fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { @@ -170,6 +182,11 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (fc->mounted) fuse_release_background(fc, req); up_read(&fc->sbput_sem); + + /* fput must go outside sbput_sem, otherwise it can deadlock */ + if (req->file) + fput(req->file); + if (end) end(fc, req); else @@ -277,6 +294,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e4f041a11bb5..fc342cf7c2cc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -565,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 19c7185a7546..0474202cb5dc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -159,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b0588..7627022446b2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -500,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - /* Setting file->private_data can't race with other mount() - instances, since BKL is held for ->get_sb() */ - if (file->private_data) - return -EINVAL; - fc = new_conn(); if (!fc) return -ENOMEM; @@ -540,6 +535,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_free_req; + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + err = -EINVAL; + if (file->private_data) + goto err_kobject_del; + sb->s_root = root_dentry; fc->mounted = 1; fc->connected = 1; @@ -556,6 +557,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return 0; + err_kobject_del: + kobject_del(&fc->kobj); err_free_req: fuse_request_free(init_req); err_put_root: diff --git a/fs/inotify.c b/fs/inotify.c index 1f50302849c5..732ec4bd5774 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -848,7 +848,11 @@ static int inotify_release(struct inode *ignored, struct file *file) inode = watch->inode; mutex_lock(&inode->inotify_mutex); mutex_lock(&dev->mutex); - remove_watch_no_event(watch, dev); + + /* make sure we didn't race with another list removal */ + if (likely(idr_find(&dev->idr, watch->wd))) + remove_watch_no_event(watch, dev); + mutex_unlock(&dev->mutex); mutex_unlock(&inode->inotify_mutex); put_inotify_watch(watch); @@ -890,8 +894,7 @@ static int inotify_ignore(struct inotify_device *dev, s32 wd) mutex_lock(&dev->mutex); /* make sure that we did not race */ - watch = idr_find(&dev->idr, wd); - if (likely(watch)) + if (likely(idr_find(&dev->idr, wd) == watch)) remove_watch(watch, dev); mutex_unlock(&dev->mutex); diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index d4d0c41490cd..1d46677afd17 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -438,7 +438,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info if (c->mtd->point) { err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer); if (!err && retlen < tn->csize) { - JFFS2_WARNING("MTD point returned len too short: %u instead of %u.\n", retlen, tn->csize); + JFFS2_WARNING("MTD point returned len too short: %zu " + "instead of %u.\n", retlen, tn->csize); c->mtd->unpoint(c->mtd, buffer, ofs, len); } else if (err) JFFS2_WARNING("MTD point failed: error code %d.\n", err); @@ -461,7 +462,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info } if (retlen != len) { - JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ofs, retlen, len); + JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", + ofs, retlen, len); err = -EIO; goto free_out; } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index f28696f235c4..2b220dd6b4e7 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -542,7 +542,7 @@ add_failed: static int metapage_releasepage(struct page *page, gfp_t gfp_mask) { struct metapage *mp; - int busy = 0; + int ret = 1; unsigned int offset; for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { @@ -552,30 +552,20 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) continue; jfs_info("metapage_releasepage: mp = 0x%p", mp); - if (mp->count || mp->nohomeok) { + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { jfs_info("count = %ld, nohomeok = %d", mp->count, mp->nohomeok); - busy = 1; + ret = 0; continue; } - wait_on_page_writeback(page); - //WARN_ON(test_bit(META_dirty, &mp->flag)); - if (test_bit(META_dirty, &mp->flag)) { - dump_mem("dirty mp in metapage_releasepage", mp, - sizeof(struct metapage)); - dump_mem("page", page, sizeof(struct page)); - dump_stack(); - } if (mp->lsn) remove_from_logsync(mp); remove_metapage(page, mp); INCREMENT(mpStat.pagefree); free_metapage(mp); } - if (busy) - return -1; - - return 0; + return ret; } static void metapage_invalidatepage(struct page *page, unsigned long offset) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d50..3ef739120dff 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48b..6f99c0a6f836 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -446,15 +446,14 @@ static struct lock_manager_operations lease_manager_ops = { */ static int lease_init(struct file *filp, int type, struct file_lock *fl) { + if (assign_type(fl, type) != 0) + return -EINVAL; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; fl->fl_flags = FL_LEASE; - if (assign_type(fl, type) != 0) { - locks_free_lock(fl); - return -EINVAL; - } fl->fl_start = 0; fl->fl_end = OFFSET_MAX; fl->fl_ops = NULL; @@ -466,16 +465,19 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) static int lease_alloc(struct file *filp, int type, struct file_lock **flp) { struct file_lock *fl = locks_alloc_lock(); - int error; + int error = -ENOMEM; if (fl == NULL) - return -ENOMEM; + goto out; error = lease_init(filp, type, fl); - if (error) - return error; + if (error) { + locks_free_lock(fl); + fl = NULL; + } +out: *flp = fl; - return 0; + return error; } /* Check if two locks overlap each other. @@ -1372,6 +1374,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) goto out; if (my_before != NULL) { + *flp = *my_before; error = lease->fl_lmops->fl_change(my_before, arg); goto out; } @@ -2230,7 +2233,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock. + */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2256,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/namei.c b/fs/namei.c index 96723ae83c89..d6e2ee251736 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1080,8 +1080,8 @@ static int fastcall do_path_lookup(int dfd, const char *name, nd->flags = flags; nd->depth = 0; - read_lock(¤t->fs->lock); if (*name=='/') { + read_lock(¤t->fs->lock); if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { nd->mnt = mntget(current->fs->altrootmnt); nd->dentry = dget(current->fs->altroot); @@ -1092,33 +1092,35 @@ static int fastcall do_path_lookup(int dfd, const char *name, } nd->mnt = mntget(current->fs->rootmnt); nd->dentry = dget(current->fs->root); + read_unlock(¤t->fs->lock); } else if (dfd == AT_FDCWD) { + read_lock(¤t->fs->lock); nd->mnt = mntget(current->fs->pwdmnt); nd->dentry = dget(current->fs->pwd); + read_unlock(¤t->fs->lock); } else { struct dentry *dentry; file = fget_light(dfd, &fput_needed); retval = -EBADF; if (!file) - goto unlock_fail; + goto out_fail; dentry = file->f_dentry; retval = -ENOTDIR; if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_unlock_fail; + goto fput_fail; retval = file_permission(file, MAY_EXEC); if (retval) - goto fput_unlock_fail; + goto fput_fail; nd->mnt = mntget(file->f_vfsmnt); nd->dentry = dget(dentry); fput_light(file, fput_needed); } - read_unlock(¤t->fs->lock); current->total_link_count = 0; retval = link_path_walk(name, nd); out: @@ -1127,13 +1129,12 @@ out: nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); } +out_fail: return retval; -fput_unlock_fail: +fput_fail: fput_light(file, fput_needed); -unlock_fail: - read_unlock(¤t->fs->lock); - return retval; + goto out_fail; } int fastcall path_lookup(const char *name, unsigned int flags, diff --git a/fs/namespace.c b/fs/namespace.c index 2c5f1f80bdc2..bf478addb852 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,13 +899,11 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) +static int do_loopback(struct nameidata *nd, char *old_name, int recurse) { struct nameidata old_nd; struct vfsmount *mnt = NULL; - int recurse = flags & MS_REC; int err = mount_is_safe(nd); - if (err) return err; if (!old_name || !*old_name) @@ -939,7 +937,6 @@ static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } - mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1353,7 +1350,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags, mnt_flags); + retval = do_loopback(&nd, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f34894167..cae74dd4c7f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16ddb..3c72b0c07283 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d9259..fade02c15e6e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b6..d0b991a92327 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c67..d86c0db7b1e8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 4e0578121d9a..3eec30000f3f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1066,9 +1066,11 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, rv = nfserr_perm; else if (IS_ERR(exp)) rv = nfserrno(PTR_ERR(exp)); - else + else { rv = fh_compose(fhp, exp, fsid_key->ek_dentry, NULL); + exp_put(exp); + } cache_put(&fsid_key->h, &svc_expkey_cache); return rv; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6aa92d0e6876..1d65f13f458c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1922,11 +1922,10 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - size = posix_acl_to_xattr(acl, value, size); - if (size < 0) { - error = size; + error = posix_acl_to_xattr(acl, value, size); + if (error < 0) goto getout; - } + size = error; } else size = 0; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0d858d0b25be..47152bf9a7f2 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -276,13 +276,29 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } +/* This can also be called from ocfs2_write_zero_page() which has done + * it's own cluster locking. */ +int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + int ret; + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + ret = block_prepare_write(page, from, to, ocfs2_get_block); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + return ret; +} + /* * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called * from loopback. It must be able to perform its own locking around * ocfs2_get_block(). */ -int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) { struct inode *inode = page->mapping->host; int ret; @@ -295,11 +311,7 @@ int ocfs2_prepare_write(struct file *file, struct page *page, goto out; } - down_read(&OCFS2_I(inode)->ip_alloc_sem); - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - up_read(&OCFS2_I(inode)->ip_alloc_sem); + ret = ocfs2_prepare_write_nolock(inode, page, from, to); ocfs2_meta_unlock(inode, 0); out: @@ -625,11 +637,31 @@ static ssize_t ocfs2_direct_IO(int rw, int ret; mlog_entry_void(); + + /* + * We get PR data locks even for O_DIRECT. This allows + * concurrent O_DIRECT I/O but doesn't let O_DIRECT with + * extending and buffered zeroing writes race. If they did + * race then the buffered zeroing could be written back after + * the O_DIRECT I/O. It's one thing to tell people not to mix + * buffered and O_DIRECT writes, but expecting them to + * understand that file extension is also an implicit buffered + * write is too much. By getting the PR we force writeback of + * the buffered zeroing before proceeding. + */ + ret = ocfs2_data_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_data_unlock(inode, 0); + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); +out: mlog_exit(ret); return ret; } diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index d40456d509a0..e88c3f0b8fa9 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,8 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to); +int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, + unsigned from, unsigned to); struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 4601fc256f11..1a5c69071df6 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -569,7 +569,7 @@ static int ocfs2_extent_map_insert(struct inode *inode, ret = -ENOMEM; ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.new_ent) { mlog_errno(ret); return ret; @@ -583,14 +583,14 @@ static int ocfs2_extent_map_insert(struct inode *inode, if (ctxt.need_left && !ctxt.left_ent) { ctxt.left_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.left_ent) break; } if (ctxt.need_right && !ctxt.right_ent) { ctxt.right_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.right_ent) break; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 581eb451a41a..a9559c874530 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -613,7 +613,8 @@ leave: /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->commit_write(). */ + * worry about recursive locking in ->prepare_write() and + * ->commit_write(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { @@ -641,7 +642,7 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write(NULL, page, offset, offset); + ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -695,13 +696,26 @@ out: return ret; } +/* + * A tail_to_skip value > 0 indicates that we're being called from + * ocfs2_file_aio_write(). This has the following implications: + * + * - we don't want to update i_size + * - di_bh will be NULL, which is fine because it's only used in the + * case where we want to update i_size. + * - ocfs2_zero_extend() will then only be filling the hole created + * between i_size and the start of the write. + */ static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, - u64 new_i_size) + u64 new_i_size, + size_t tail_to_skip) { int ret = 0; u32 clusters_to_add; + BUG_ON(!tail_to_skip && !di_bh); + /* setattr sometimes calls us like this. */ if (new_i_size == 0) goto out; @@ -714,27 +728,44 @@ static int ocfs2_extend_file(struct inode *inode, OCFS2_I(inode)->ip_clusters; if (clusters_to_add) { - ret = ocfs2_extend_allocation(inode, clusters_to_add); + /* + * protect the pages that ocfs2_zero_extend is going to + * be pulling into the page cache.. we do this before the + * metadata extend so that we don't get into the situation + * where we've extended the metadata but can't get the data + * lock to zero. + */ + ret = ocfs2_data_lock(inode, 1); if (ret < 0) { mlog_errno(ret); goto out; } - ret = ocfs2_zero_extend(inode, new_i_size); + ret = ocfs2_extend_allocation(inode, clusters_to_add); if (ret < 0) { mlog_errno(ret); - goto out; + goto out_unlock; } - } - /* No allocation required, we just use this helper to - * do a trivial update of i_size. */ - ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (ret < 0) { - mlog_errno(ret); - goto out; + ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!tail_to_skip) { + /* We're being called from ocfs2_setattr() which wants + * us to update i_size */ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); } +out_unlock: + if (clusters_to_add) /* this is the only case in which we lock */ + ocfs2_data_unlock(inode, 1); + out: return ret; } @@ -793,7 +824,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (i_size_read(inode) > attr->ia_size) status = ocfs2_truncate_file(inode, bh, attr->ia_size); else - status = ocfs2_extend_file(inode, bh, attr->ia_size); + status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1049,21 +1080,12 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (!clusters) break; - ret = ocfs2_extend_allocation(inode, clusters); + ret = ocfs2_extend_file(inode, NULL, newsize, count); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - - /* Fill any holes which would've been created by this - * write. If we're O_APPEND, this will wind up - * (correctly) being a noop. */ - ret = ocfs2_zero_extend(inode, (u64) newsize - count); - if (ret < 0) { - mlog_errno(ret); - goto out; - } break; } @@ -1146,6 +1168,22 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ocfs2_iocb_set_rw_locked(iocb); } + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_meta_unlock(inode, 0); + ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); if (ret == -EINVAL) mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 6a610ae53583..eebc3cfa6be8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -117,7 +117,7 @@ struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) { struct ocfs2_journal_handle *retval = NULL; - retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); + retval = kcalloc(1, sizeof(*retval), GFP_NOFS); if (!retval) { mlog(ML_ERROR, "Failed to allocate memory for journal " "handle!\n"); @@ -870,9 +870,11 @@ static int ocfs2_force_read_journal(struct inode *inode) if (p_blocks > CONCURRENT_JOURNAL_FILL) p_blocks = CONCURRENT_JOURNAL_FILL; + /* We are reading journal data which should not + * be put in the uptodate cache */ status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), p_blkno, p_blocks, bhs, 0, - inode); + NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -982,7 +984,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, { struct ocfs2_la_recovery_item *item; - item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); if (!item) { /* Though we wish to avoid it, we are in fact safe in * skipping local alloc cleanup as fsck.ocfs2 is more diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 04a684dfdd96..b8a00a793326 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -337,7 +337,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, (unsigned long long)oi->ip_blkno, (unsigned long long)block, expand_tree); - new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); if (!new) { mlog_errno(-ENOMEM); return; @@ -349,7 +349,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, * has no way of tracking that. */ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, - GFP_KERNEL); + GFP_NOFS); if (!tree[i]) { mlog_errno(-ENOMEM); goto out_free; diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index 53049a204197..ee42765a8553 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -586,7 +586,7 @@ static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response { struct ocfs2_net_wait_ctxt *w; - w = kcalloc(1, sizeof(*w), GFP_KERNEL); + w = kcalloc(1, sizeof(*w), GFP_NOFS); if (!w) { mlog_errno(-ENOMEM); goto bail; @@ -749,7 +749,7 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, BUG_ON(!ocfs2_is_valid_vote_request(type)); - request = kcalloc(1, sizeof(*request), GFP_KERNEL); + request = kcalloc(1, sizeof(*request), GFP_NOFS); if (!request) { mlog_errno(-ENOMEM); } else { @@ -1129,7 +1129,7 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, struct ocfs2_super *osb = data; struct ocfs2_vote_work *work; - work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); + work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS); if (!work) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/open.c b/fs/open.c index c32c89d6d8db..317b7c7f38a7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,10 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +346,10 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #endif @@ -1093,22 +1099,31 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } -EXPORT_SYMBOL_GPL(sys_openat); #ifndef __alpha__ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e784..7ef1f094de91 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -367,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -374,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } @@ -395,6 +405,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +418,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +431,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) @@ -506,6 +533,7 @@ void del_gendisk(struct gendisk *disk) devfs_remove_disk(disk); + kobject_uevent(&disk->kobj, KOBJ_REMOVE); if (disk->holder_dir) kobject_unregister(disk->holder_dir); if (disk->slave_dir) @@ -518,7 +546,7 @@ void del_gendisk(struct gendisk *disk) kfree(disk_name); } put_device(disk->driverfs_dev); + disk->driverfs_dev = NULL; } - kobject_uevent(&disk->kobj, KOBJ_REMOVE); kobject_del(&disk->kobj); } diff --git a/fs/pipe.c b/fs/pipe.c index e984beb93a0e..5acd8954aaa0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe) } static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) +pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) { unsigned long copy; @@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } to += copy; len -= copy; iov->iov_base += copy; @@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) } static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) +pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, + int atomic) { unsigned long copy; @@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } from += copy; len -= copy; iov->iov_base += copy; @@ -94,13 +106,52 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. + */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - buf->flags &= ~PIPE_BUF_FLAG_STOLEN; - /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep @@ -112,31 +163,58 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } -static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) { + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} + +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { - kunmap(buf->page); + page_cache_get(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) { - buf->flags |= PIPE_BUF_FLAG_STOLEN; return 0; } static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = anon_pipe_buf_map, - .unmap = anon_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, }; static ssize_t @@ -167,22 +245,33 @@ pipe_readv(struct file *filp, const struct iovec *_iov, struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; - int error; + int error, atomic; if (chars > total_len) chars = total_len; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { + error = ops->pin(pipe, buf); + if (error) { if (!ret) - ret = PTR_ERR(addr); + error = ret; break; } - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(pipe, buf); + + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); if (unlikely(error)) { + /* + * Just retry with the slow path if we failed. + */ + if (atomic) { + atomic = 0; + goto redo; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; @@ -286,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; void *addr; - int error; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { - error = PTR_ERR(addr); + error = ops->pin(pipe, buf); + if (error) goto out; - } + + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); error = pipe_iov_copy_from_user(offset + addr, iov, - chars); - ops->unmap(pipe, buf); + chars, atomic); + ops->unmap(pipe, buf, addr); ret = error; do_wakeup = 1; - if (error) + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } goto out; + } buf->len += chars; total_len -= chars; ret = chars; @@ -323,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - int error; + char *src; + int error, atomic = 1; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -343,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = pipe_iov_copy_from_user(kmap(page), iov, chars); - kunmap(page); + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + if (unlikely(error)) { + if (atomic) { + atomic = 0; + goto redo2; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef689..6cc77dc3f3ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 58c418fbca2c..97ae1b92bc47 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode) acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); - ret = acl ? 1 : 0; - posix_acl_release(acl); + ret = (acl && !IS_ERR(acl)); + if (ret) + posix_acl_release(acl); } return ret; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 34c7a11d91f0..70d9c5a37f5a 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -434,6 +434,11 @@ smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (dentry->d_name.len > SMB_MAXNAMELEN) goto out; + /* Do not allow lookup of names with backslashes in */ + error = -EINVAL; + if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) + goto out; + lock_kernel(); error = smb_proc_getattr(dentry, &finfo); #ifdef SMBFS_PARANOIA diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71c375863cc..c71dd2760d32 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -339,9 +339,11 @@ int smb_add_request(struct smb_request *req) /* * On timeout or on interrupt we want to try and remove the * request from the recvq/xmitq. + * First check if the request is still part of a queue. (May + * have been removed by some error condition) */ smb_lock_server(server); - if (!(req->rq_flags & SMB_REQ_RECEIVED)) { + if (!list_empty(&req->rq_queue)) { list_del_init(&req->rq_queue); smb_rput(req); } diff --git a/fs/splice.c b/fs/splice.c index e50a460239dd..a285fd746dc0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -27,15 +27,22 @@ #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/uio.h> + +struct partial_page { + unsigned int offset; + unsigned int len; +}; /* - * Passed to the actors + * Passed to splice_to_pipe */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ + struct pipe_buf_operations *ops;/* ops associated with output pipe */ }; /* @@ -44,13 +51,14 @@ struct splice_desc { * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ -static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,24 +73,24 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } - buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; + buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } -static void page_cache_pipe_buf_release(struct pipe_inode_info *info, +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_release(buf->page); - buf->page = NULL; - buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); + buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static void *page_cache_pipe_buf_map(struct file *file, - struct pipe_inode_info *info, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -108,44 +116,59 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * Page is ok afterall, fall through to mapping. + * Page is ok afterall, we are done. */ unlock_page(page); } - return kmap(page); + return 0; error: unlock_page(page); - return ERR_PTR(err); + return err; } -static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, - struct pipe_buffer *buf) +static struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = page_cache_pipe_buf_pin, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { - kunmap(buf->page); + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); } -static struct pipe_buf_operations page_cache_pipe_buf_ops = { +static struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = page_cache_pipe_buf_map, - .unmap = page_cache_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = page_cache_pipe_buf_release, - .steal = page_cache_pipe_buf_steal, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, }; /* * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) +static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - int ret, do_wakeup, i; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -161,27 +184,22 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, if (pipe->nrbufs < PIPE_BUFFERS) { int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; pipe->nrbufs++; + page_nr++; + ret += buf->len; + if (pipe->inode) do_wakeup = 1; - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) - break; - if (!len) + if (!--spd->nr_pages) break; if (pipe->nrbufs < PIPE_BUFFERS) continue; @@ -189,7 +207,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -224,26 +242,36 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd->nr_pages) + page_cache_release(spd->pages[page_nr++]); return ret; } static int -__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, nr_pages; struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; struct page *page; - pgoff_t index; - int i, error; - - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pgoff_t index, end_index; + loff_t isize; + size_t total_len; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; @@ -253,49 +281,94 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!offset || nr_pages > 1) - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!loff || nr_pages > 1) + page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* * Now fill in the holes: */ error = 0; - for (i = 0; i < nr_pages; i++, index++) { -find_page: + total_len = 0; + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * allocate the rest. + */ + index += spd.nr_pages; + while (spd.nr_pages < nr_pages) { /* - * lookup the page for this index + * Page could be there, find_get_pages_contig() breaks on + * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): + * Make sure the read-ahead engine is notified + * about this failure. */ - if (flags & SPLICE_F_NONBLOCK) - break; + handle_ra_miss(mapping, &in->f_ra, index); /* - * page didn't exist, allocate one + * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); + if (error == -EEXIST) + continue; break; } - - goto readpage; + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); } + pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = pages[page_nr]; + /* * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -305,7 +378,6 @@ find_page: */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); break; } /* @@ -316,25 +388,66 @@ find_page: goto fill_it; } -readpage: /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ if (error == AOP_TRUNCATED_PAGE) - goto find_page; + error = 0; + break; } + + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (total_len + loff > isize) + break; + /* + * force quit after adding this page + */ + len = this_len; + this_len = min(this_len, loff); + loff = 0; + } } fill_it: - pages[i] = page; + partial[page_nr].offset = loff; + partial[page_nr].len = this_len; + len -= this_len; + total_len += this_len; + loff = 0; + spd.nr_pages++; + index++; } - if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + /* + * Release any pages at the end, if we quit early. 'i' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(pages[page_nr++]); + + if (spd.nr_pages) + return splice_to_pipe(pipe, &spd); return error; } @@ -348,8 +461,9 @@ fill_it: * * Will read pages from given file and fill them into a pipe. */ -ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; @@ -358,19 +472,22 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) @@ -383,38 +500,24 @@ EXPORT_SYMBOL(generic_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). + * using sendpage(). Return the number of bytes sent. */ -static int pipe_to_sendpage(struct pipe_inode_info *info, +static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; - unsigned int offset; - ssize_t ret; - void *ptr; - int more; - - /* - * Sub-optimal, but we are limited by the pipe ->map. We don't - * need a kmap'ed buffer here, we just want to make sure we - * have the page pinned if the pipe page originates from the - * page cache. - */ - ptr = buf->ops->map(file, info, buf); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); + int ret, more; - offset = pos & ~PAGE_CACHE_MASK; - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + ret = buf->ops->pin(pipe, buf); + if (!ret) { + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); - - buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + } - return -EIO; + return ret; } /* @@ -437,62 +540,80 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. */ -static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, +static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; struct address_space *mapping = file->f_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); - unsigned int offset; + unsigned int offset, this_len; struct page *page; pgoff_t index; - char *src; int ret; /* * make sure the data in this buffer is uptodate */ - src = buf->ops->map(file, info, buf); - if (IS_ERR(src)) - return PTR_ERR(src); + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + /* - * Reuse buf page, if SPLICE_F_MOVE is set. + * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full + * page. */ - if (sd->flags & SPLICE_F_MOVE) { + if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* - * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * If steal succeeds, buf->page is now pruned from the + * pagecache and we can reuse it. The page will also be + * locked on successful return. */ - if (buf->ops->steal(info, buf)) + if (buf->ops->steal(pipe, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; - if (add_to_page_cache(page, mapping, index, gfp_mask)) + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + unlock_page(page); goto find_page; + } + + page_cache_get(page); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { + if (this_len < PAGE_CACHE_SIZE) { ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; @@ -511,58 +632,72 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); - if (ret == AOP_TRUNCATED_PAGE) { + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); + if (unlikely(ret)) { + loff_t isize = i_size_read(mapping->host); + + if (ret != AOP_TRUNCATED_PAGE) + unlock_page(page); page_cache_release(page); - goto find_page; - } else if (ret) + if (ret == AOP_TRUNCATED_PAGE) + goto find_page; + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + if (sd->pos + this_len > isize) + vmtruncate(mapping->host, isize); + goto out; + } - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { - char *dst = kmap_atomic(page, KM_USER0); + if (buf->page != page) { + /* + * Careful, ->map() uses KM_USER0! + */ + char *src = buf->ops->map(pipe, buf, 1); + char *dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + buf->offset, sd->len); + memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(pipe, buf, src); } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); - if (ret == AOP_TRUNCATED_PAGE) { + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); + if (!ret) { + /* + * Return the number of bytes written and mark page as + * accessed, we are now done! + */ + ret = this_len; + mark_page_accessed(page); + balance_dirty_pages_ratelimited(mapping); + } else if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; - } else if (ret) - goto out; - - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); -out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { - page_cache_release(page); - unlock_page(page); } +out: + page_cache_release(page); + unlock_page(page); out_nomem: - buf->ops->unmap(info, buf); return ret; } -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - /* * Pipe input worker. Most of this logic works like a regular pipe, the * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags, - splice_actor *actor) +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -573,7 +708,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -588,16 +723,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.len = sd.total_len; err = actor(pipe, buf, &sd); - if (err) { + if (err <= 0) { if (!ret && err != -ENODATA) ret = err; break; } - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + ret += err; + buf->offset += err; + buf->len -= err; + + sd.len -= err; + sd.pos += err; + sd.total_len -= err; + if (sd.len) + continue; if (!buf->len) { buf->ops = NULL; @@ -608,8 +749,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, do_wakeup = 1; } - sd.pos += sd.len; - sd.total_len -= sd.len; if (!sd.total_len) break; } @@ -656,9 +795,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - out->f_pos = sd.pos; return ret; - } /** @@ -674,28 +811,32 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, */ ssize_t generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; - if (err) - ret = err; + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } } return ret; @@ -715,9 +856,9 @@ EXPORT_SYMBOL(generic_file_splice_write); * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -726,9 +867,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; if (unlikely(!out->f_op || !out->f_op->splice_write)) @@ -737,22 +877,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -761,28 +900,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; + left = isize - *ppos; if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; long ret, bytes; + loff_t out_off; umode_t i_mode; int i; @@ -807,7 +945,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, /* * We don't have an immediate reader, but we'll read the stuff - * out of the pipe right after the move_to_pipe(). So set + * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; @@ -820,6 +958,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ ret = 0; bytes = 0; + out_off = 0; while (len) { size_t read_len, max_read_len; @@ -829,7 +968,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, pipe, max_read_len, flags); + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; @@ -840,7 +979,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, read_len, + ret = do_splice_from(pipe, out, &out_off, read_len, flags & ~SPLICE_F_NONBLOCK); if (unlikely(ret < 0)) goto out_release; @@ -898,6 +1037,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -906,12 +1047,18 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (out->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&out->f_pos, off_out, - sizeof(loff_t))) + if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &out->f_pos; - return do_splice_from(pipe, out, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -921,16 +1068,201 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (in->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &in->f_pos; - return do_splice_to(in, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; } +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, int aligned) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(¤t->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(¤t->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + * + * Note that vmsplice only supports splicing _from_ user memory to a pipe, + * not the other way around. Splicing from user memory is a simple operation + * that can be supported without any funky alignment restrictions or nasty + * vm tricks. We simply map in the user memory and fill them into a pipe. + * The reverse isn't quite as easy, though. There are two possible solutions + * for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Alas, it isn't here. + * + */ +static long do_vmsplice(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + }; + + if (unlikely(!pipe)) + return -EBADF; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, + flags & SPLICE_F_GIFT); + if (spd.nr_pages <= 0) + return spd.nr_pages; + + return splice_to_pipe(pipe, &spd); +} + +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct file *file; + long error; + int fput; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = do_vmsplice(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags) @@ -961,3 +1293,198 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + ipipe_first = 1; + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. + */ + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/stat.c b/fs/stat.c index 9948cc1685a4..0f282face322 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,7 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } -#ifndef __ARCH_WANT_STAT64 +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6cfdc9a87772..610b5bdbe75b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f1cb1ddde511..cf3786625bfa 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include <linux/fsnotify.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Nether 'poll' or 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e9..3651ffb5ec09 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 269721af02f3..c847416f6d10 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,6 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -259,13 +260,14 @@ xfs_file_splice_read( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -273,7 +275,7 @@ xfs_file_splice_read_invis( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } @@ -281,13 +283,14 @@ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); return rval; } @@ -295,13 +298,14 @@ STATIC ssize_t xfs_file_splice_write_invis( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); return rval; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 74a52937f208..67efe3308980 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,6 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, @@ -360,7 +361,7 @@ xfs_splice_read( int error; error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - infilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(infilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -368,8 +369,8 @@ xfs_splice_read( } } xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, infilp->f_pos, ioflags); - ret = generic_file_splice_read(infilp, pipe, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -382,6 +383,7 @@ xfs_splice_write( bhv_desc_t *bdp, struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags, @@ -403,7 +405,7 @@ xfs_splice_write( int error; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - outfilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(outfilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -411,8 +413,8 @@ xfs_splice_write( } } xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, outfilp->f_pos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 55c689a86ad2..8f4539952350 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 88b09f186289..2a8e16c22353 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); @@ -284,10 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) -#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) -#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 64ee07db0d5e..8558226281c4 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1942,8 +1942,10 @@ xfs_alloc_fix_freelist( /* * Allocate as many blocks as possible at once. */ - if ((error = xfs_alloc_ag_vextent(&targs))) + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); return error; + } /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1960,6 +1962,7 @@ xfs_alloc_fix_freelist( return error; } } + xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; } diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 81a05cfd77d2..1f148762eb28 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -316,6 +316,18 @@ xfs_rename( } } + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + goto rele_return; + } + new_parent = (src_dp != target_dp); src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index f0e09ca14139..36ea1b2094f2 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -669,31 +669,22 @@ xfs_mntupdate( xfs_mount_t *mp = XFS_BHVTOM(bdp); int error; - if (args->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - if ((vfsp->vfs_flag & VFS_RDONLY) && - !(*flags & MS_RDONLY)) { - vfsp->vfs_flag &= ~VFS_RDONLY; - - if (args->flags & XFSMNT_BARRIER) + if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ + if (vfsp->vfs_flag & VFS_RDONLY) + vfsp->vfs_flag &= ~VFS_RDONLY; + if (args->flags & XFSMNT_BARRIER) { + mp->m_flags |= XFS_MOUNT_BARRIER; xfs_mountfs_check_barriers(mp); - } - - if (!(vfsp->vfs_flag & VFS_RDONLY) && - (*flags & MS_RDONLY)) { + } else { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } + } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error); - xfs_quiesce_fs(mp); - - /* Ok now write out an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); vfsp->vfs_flag |= VFS_RDONLY; } - return 0; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index fa71b305ba5c..7027ae68ee38 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2663,7 +2663,7 @@ xfs_link( */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (tdp->i_d.di_projid != sip->i_d.di_projid))) { - error = XFS_ERROR(EPERM); + error = XFS_ERROR(EXDEV); goto error_return; } |